diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
new file mode 100644
index 00000000000..96781221cf1
--- /dev/null
+++ b/.github/workflows/go.yml
@@ -0,0 +1,362 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Go
+
+on:
+ push:
+ paths:
+ - '.github/workflows/go.yml'
+ - 'ci/docker/*_go.dockerfile'
+ - 'ci/scripts/go_*'
+ - 'go/**'
+ pull_request:
+ paths:
+ - '.github/workflows/go.yml'
+ - 'ci/docker/*_go.dockerfile'
+ - 'ci/docker/**'
+ - 'ci/scripts/go_*'
+ - 'go/**'
+
+concurrency:
+ group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+jobs:
+
+ docker:
+ name: AMD64 Debian 11 Go ${{ matrix.go }}
+ runs-on: ubuntu-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        go: ['1.17', '1.18']  # quoted: as YAML floats a version like 1.20 would become 1.2
+        include:
+          - go: '1.17'  # must match the matrix value's type to attach staticcheck
+            staticcheck: v0.2.2
+          - go: '1.18'
+            staticcheck: latest
+ env:
+ GO: ${{ matrix.go }}
+ STATICCHECK: ${{ matrix.staticcheck }}
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ submodules: recursive
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ - name: Setup Archery
+ run: pip install -e dev/archery[docker]
+ - name: Execute Docker Build
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ run: archery docker run debian-go
+ - name: Docker Push
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ continue-on-error: true
+ run: archery docker push debian-go
+ - name: Run Benchmarks
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ env:
+ CONBENCH_URL: https://conbench.ursa.dev
+ CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }}
+ CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }}
+ CONBENCH_REF: ${{ github.ref_name }}
+ run: |
+ pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python
+ python ci/scripts/go_bench_adapt.py
+
+ docker_cgo:
+ name: AMD64 Debian 11 GO ${{ matrix.go }} - CGO
+ runs-on: ubuntu-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        go: ['1.17', '1.18']  # quoted: as YAML floats a version like 1.20 would become 1.2
+        include:
+          - go: '1.17'  # must match the matrix value's type to attach staticcheck
+            staticcheck: v0.2.2
+          - go: '1.18'
+            staticcheck: latest
+ env:
+ GO: ${{ matrix.go }}
+ STATICCHECK: ${{ matrix.staticcheck }}
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ submodules: recursive
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.8
+ - name: Setup Archery
+ run: pip install -e dev/archery[docker]
+ - name: Execute Docker Build
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ run: archery docker run debian-go-cgo
+ - name: Docker Push
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ continue-on-error: true
+ run: archery docker push debian-go-cgo
+
+
+ docker_cgo_python:
+ name: AMD64 Debian 11 GO ${{ matrix.go }} - CGO Python
+ runs-on: ubuntu-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        go: ['1.17', '1.18']  # quoted: as YAML floats a version like 1.20 would become 1.2
+        include:
+          - go: '1.17'  # must match the matrix value's type to attach staticcheck
+            staticcheck: v0.2.2
+          - go: '1.18'
+            staticcheck: latest
+ env:
+ GO: ${{ matrix.go }}
+ STATICCHECK: ${{ matrix.staticcheck }}
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.8
+ - name: Setup Archery
+ run: pip install -e dev/archery[docker]
+ - name: Execute Docker Build
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ run: archery docker run debian-go-cgo-python
+ - name: Docker Push
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ continue-on-error: true
+ run: archery docker push debian-go-cgo-python
+
+ windows:
+ name: AMD64 Windows 2019 Go ${{ matrix.go }}
+ runs-on: windows-2019
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        go: ['1.17', '1.18']  # quoted: as YAML floats a version like 1.20 would become 1.2
+        include:
+          - go: '1.17'  # must match the matrix value's type to attach staticcheck
+            staticcheck: v0.2.2
+          - go: '1.18'
+            staticcheck: latest
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ submodules: recursive
+ - name: Install go
+ uses: actions/setup-go@v3
+ with:
+ go-version: ${{ matrix.go }}
+ cache: true
+ cache-dependency-path: go/go.sum
+ - name: Install staticcheck
+ run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }}
+ - name: Build
+ shell: bash
+ run: ci/scripts/go_build.sh $(pwd)
+ - name: Test
+ shell: bash
+ run: ci/scripts/go_test.sh $(pwd)
+
+ macos:
+ name: AMD64 macOS 11 Go ${{ matrix.go }}
+ runs-on: macos-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        go: ['1.17', '1.18']  # quoted: as YAML floats a version like 1.20 would become 1.2
+        include:
+          - go: '1.17'  # must match the matrix value's type to attach staticcheck
+            staticcheck: v0.2.2
+          - go: '1.18'
+            staticcheck: latest
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ submodules: recursive
+ - name: Install go
+ uses: actions/setup-go@v3
+ with:
+ go-version: ${{ matrix.go }}
+ cache: true
+ cache-dependency-path: go/go.sum
+ - name: Install staticcheck
+ run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }}
+ - name: Build
+ shell: bash
+ run: ci/scripts/go_build.sh $(pwd)
+ - name: Test
+ shell: bash
+ run: ci/scripts/go_test.sh $(pwd)
+ - name: Setup Python
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.10'
+ - name: Run Benchmarks
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ shell: bash
+ env:
+ CONBENCH_URL: 'https://conbench.ursa.dev'
+ CONBENCH_EMAIL: ${{ secrets.CONBENCH_EMAIL }}
+ CONBENCH_PASSWORD: ${{ secrets.CONBENCH_PASS }}
+ CONBENCH_REF: ${{ github.ref_name }}
+ run: |
+ pip install benchadapt@git+https://github.com/conbench/conbench.git@main#subdirectory=benchadapt/python
+ python ci/scripts/go_bench_adapt.py
+
+
+ macos-cgo:
+ name: AMD64 macOS 11 Go ${{ matrix.go }} - CGO
+ runs-on: macos-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        go: ['1.17', '1.18']  # quoted: as YAML floats a version like 1.20 would become 1.2
+        include:
+          - go: '1.17'  # must match the matrix value's type to attach staticcheck
+            staticcheck: v0.2.2
+          - go: '1.18'
+            staticcheck: latest
+ env:
+ ARROW_GO_TESTCGO: "1"
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ submodules: recursive
+ - name: Install go
+ uses: actions/setup-go@v3
+ with:
+ go-version: ${{ matrix.go }}
+ cache: true
+ cache-dependency-path: go/go.sum
+ - name: Brew Install Arrow
+ shell: bash
+ run: brew install apache-arrow
+ - name: Install staticcheck
+ run: go install honnef.co/go/tools/cmd/staticcheck@${{ matrix.staticcheck }}
+ - name: Build
+ shell: bash
+ run: ci/scripts/go_build.sh $(pwd)
+ - name: Test
+ shell: bash
+ run: ci/scripts/go_test.sh $(pwd)
+
+ windows-mingw:
+ name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} CGO
+ runs-on: windows-2019
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 60
+ strategy:
+ fail-fast: false
+ matrix:
+ mingw-n-bits:
+ #- 32 runtime handling for CGO needs 64-bit currently
+ - 64
+ env:
+ ARROW_GO_TESTCGO: "1"
+ MINGW_LINT: "1"
+ steps:
+ - name: Disable Crash Dialogs
+ run: |
+ reg add `
+ "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" `
+ /v DontShowUI `
+ /t REG_DWORD `
+ /d 1 `
+ /f
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ submodules: recursive
+ - uses: msys2/setup-msys2@v2
+ with:
+ msystem: MINGW${{ matrix.mingw-n-bits }}
+ update: true
+ - name: Setup MSYS2
+ shell: msys2 {0}
+ run: |
+ ci/scripts/msys2_setup.sh cgo
+ - name: Update CGO Env vars
+ shell: msys2 {0}
+ run: |
+ echo "CGO_CPPFLAGS=-I$(cygpath --windows ${MINGW_PREFIX}/include)" >> $GITHUB_ENV
+ echo "CGO_LDFLAGS=-g -O2 -L$(cygpath --windows ${MINGW_PREFIX}/lib) -L$(cygpath --windows ${MINGW_PREFIX}/bin)" >> $GITHUB_ENV
+ echo "MINGW_PREFIX=$(cygpath --windows ${MINGW_PREFIX})" >> $GITHUB_ENV
+ - name: Install go
+ uses: actions/setup-go@v3
+ with:
+ go-version: '1.18'
+ cache: true
+ cache-dependency-path: go/go.sum
+ - name: Install staticcheck
+ run: go install honnef.co/go/tools/cmd/staticcheck@latest
+ - name: Build
+ shell: bash
+ run: ci/scripts/go_build.sh $(pwd)
+ - name: Test
+ shell: bash
+ run: ci/scripts/go_test.sh $(pwd)
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
new file mode 100644
index 00000000000..9173f0e530b
--- /dev/null
+++ b/.github/workflows/r.yml
@@ -0,0 +1,334 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: R
+
+on:
+ push:
+ paths:
+ - ".github/workflows/r.yml"
+ - "ci/scripts/r_*.sh"
+ - "ci/scripts/cpp_*.sh"
+ - "ci/scripts/PKGBUILD"
+ - "ci/etc/rprofile"
+ - "ci/docker/**"
+ - "cpp/**"
+ - "r/**"
+ pull_request:
+ paths:
+ - ".github/workflows/r.yml"
+ - "ci/scripts/r_*.sh"
+ - "ci/scripts/cpp_*.sh"
+ - "ci/scripts/PKGBUILD"
+ - "ci/etc/rprofile"
+ - "ci/docker/**"
+ - "cpp/**"
+ - "r/**"
+
+concurrency:
+ group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+env:
+ DOCKER_VOLUME_PREFIX: ".docker/"
+
+jobs:
+ ubuntu:
+ name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} Force-Tests ${{ matrix.force-tests }}
+ runs-on: ubuntu-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 75
+    strategy:
+      fail-fast: false
+      matrix:
+        r: ["4.2"]
+        ubuntu: ["20.04"]  # quoted like its siblings so the release is never read as a float
+        force-tests: ["true"]
+ env:
+ R: ${{ matrix.r }}
+ UBUNTU: ${{ matrix.ubuntu }}
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+      - name: Cache Docker Volumes
+        uses: actions/cache@v3
+        with:
+          path: .docker  # same directory as the workflow-level DOCKER_VOLUME_PREFIX
+          # As this key is identical on both matrix builds only one will be able to successfully cache,
+          # this is fine as there are no differences in the build
+          key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-${{ github.run_id }}
+          restore-keys: |
+            ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
+            ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
+ - name: Check pkgdown reference sections
+ run: ci/scripts/r_pkgdown_check.sh
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.8
+ - name: Setup Archery
+ run: pip install -e dev/archery[docker]
+ - name: Execute Docker Build
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ run: |
+ sudo sysctl -w kernel.core_pattern="core.%e.%p"
+ ulimit -c unlimited
+          # Use a non-default, improbable time zone: Marquesas (French Polynesia) time.
+          # It has both a fractional-hour UTC offset and very few people who live there.
+ archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r
+ - name: Dump install logs
+ run: cat r/check/arrow.Rcheck/00install.out
+ if: always()
+ - name: Dump test logs
+ run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
+ if: always()
+ - name: Save the test output
+ if: always()
+ uses: actions/upload-artifact@v3
+ with:
+ name: test-output
+ path: r/check/arrow.Rcheck/tests/testthat.Rout*
+ - name: Docker Push
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ continue-on-error: true
+ run: archery docker push ubuntu-r
+
+ bundled:
+ name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}"
+ runs-on: ubuntu-latest
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 60
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - { org: "rhub", image: "debian-gcc-devel", tag: "latest", devtoolset: "" }
+ env:
+ R_ORG: ${{ matrix.config.org }}
+ R_IMAGE: ${{ matrix.config.image }}
+ R_TAG: ${{ matrix.config.tag }}
+ DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }}
+ steps:
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Setup Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.8
+ - name: Setup Archery
+ run: pip install -e dev/archery[docker]
+ - name: Execute Docker Build
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ run: |
+ sudo sysctl -w kernel.core_pattern="core.%e.%p"
+ ulimit -c unlimited
+ # Don't set a TZ here to test that case. These builds will have the following warning in them:
+ # System has not been booted with systemd as init system (PID 1). Can't operate.
+ # Failed to connect to bus: Host is down
+ archery docker run -e TZ="" r
+ - name: Dump install logs
+ run: cat r/check/arrow.Rcheck/00install.out
+ if: always()
+ - name: Dump test logs
+ run: cat r/check/arrow.Rcheck/tests/testthat.Rout*
+ if: always()
+ - name: Save the test output
+ if: always()
+ uses: actions/upload-artifact@v3
+ with:
+ name: test-output
+ path: r/check/arrow.Rcheck/tests/testthat.Rout*
+ - name: Docker Push
+ if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+ env:
+ ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+ ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+ continue-on-error: true
+ run: archery docker push r
+
+ windows-cpp:
+ name: AMD64 Windows C++ RTools ${{ matrix.config.rtools }} ${{ matrix.config.arch }}
+ runs-on: windows-2019
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 90
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - { rtools: 40, arch: 'ucrt64' }
+ steps:
+ - run: git config --global core.autocrlf false
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Setup ccache
+ shell: bash
+ run: |
+ ci/scripts/ccache_setup.sh
+ echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV
+      - name: Cache ccache
+        uses: actions/cache@v3
+        with:
+          path: ccache  # directory exported as CCACHE_DIR by the "Setup ccache" step
+          key: r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-${{ github.run_id }}
+          restore-keys: |
+            r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
+            r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch }}-
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ r-version: "4.1"
+ rtools-version: 40
+ Ncpus: 2
+ - name: Build Arrow C++
+ shell: bash
+ env:
+ MINGW_ARCH: ${{ matrix.config.arch }}
+ run: ci/scripts/r_windows_build.sh
+ - name: Rename libarrow.zip
+ # So that they're unique when multiple are downloaded in the next step
+ shell: bash
+ run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
+ - uses: actions/upload-artifact@v3
+ with:
+ name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
+ path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
+
+ windows-r:
+ needs: [windows-cpp]
+ name: AMD64 Windows R ${{ matrix.config.rversion }} RTools ${{ matrix.config.rtools }}
+ runs-on: windows-2019
+ if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+ timeout-minutes: 75
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - { rtools: 42, rversion: "4.2" }
+ - { rtools: 42, rversion: "devel" }
+ env:
+ ARROW_R_CXXFLAGS: "-Werror"
+ _R_CHECK_TESTS_NLINES_: 0
+ steps:
+ - run: git config --global core.autocrlf false
+ - name: Checkout Arrow
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - run: mkdir r/windows
+ - name: Download artifacts
+ if: ${{ matrix.config.rtools == 42 }}
+ uses: actions/download-artifact@v3
+ with:
+ name: libarrow-rtools40-ucrt64.zip
+ path: r/windows
+ - name: Unzip and rezip libarrows
+ shell: bash
+ run: |
+ cd r/windows
+ ls *.zip | xargs -n 1 unzip -uo
+ rm -rf *.zip
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ r-version: ${{ matrix.config.rversion }}
+ rtools-version: ${{ matrix.config.rtools }}
+ Ncpus: 2
+ - uses: r-lib/actions/setup-r-dependencies@v2
+ env:
+ GITHUB_PAT: "${{ github.token }}"
+ with:
+ # For some arcane reason caching does not work on the windows runners
+ # most likely due to https://github.com/actions/cache/issues/815
+ cache: false
+ working-directory: 'r'
+ extra-packages: |
+ any::rcmdcheck
+ - name: Install MinIO
+ shell: bash
+ run: |
+ mkdir -p "$HOME/.local/bin"
+ curl \
+ --output "$HOME/.local/bin/minio.exe" \
+ https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z
+ chmod +x "$HOME/.local/bin/minio.exe"
+ echo "$HOME/.local/bin" >> $GITHUB_PATH
+ # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows
+ # - name: Install Google Cloud Storage Testbench
+ # shell: bash
+ # run: ci/scripts/install_gcs_testbench.sh default
+ - name: Check
+ shell: Rscript {0}
+ run: |
+ # Because we do R CMD build and r/windows is in .Rbuildignore,
+ # assemble the libarrow.zip file and pass it as an env var
+ setwd("r/windows")
+ zip("libarrow.zip", ".")
+ setwd("..")
+
+ Sys.setenv(
+ RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
+ MAKEFLAGS = paste0("-j", parallel::detectCores()),
+ ARROW_R_DEV = TRUE,
+ "_R_CHECK_FORCE_SUGGESTS_" = FALSE
+ )
+ rcmdcheck::rcmdcheck(".",
+ build_args = '--no-build-vignettes',
+ args = c('--no-manual', '--as-cran', '--ignore-vignettes', '--run-donttest'),
+ error_on = 'warning',
+ check_dir = 'check',
+ timeout = 3600
+ )
+ - name: Run lintr
+ if: ${{ matrix.config.rversion == '4.2' }}
+ env:
+ NOT_CRAN: "true"
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ shell: Rscript {0}
+ working-directory: r
+ run: |
+ Sys.setenv(
+ RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "r", "windows", "libarrow.zip"),
+ MAKEFLAGS = paste0("-j", parallel::detectCores()),
+ ARROW_R_DEV = TRUE,
+ "_R_CHECK_FORCE_SUGGESTS_" = FALSE
+ )
+ # we use pak for package installation since it is faster, safer and more convenient
+ pak::local_install()
+ pak::pak("lintr")
+ lintr::expect_lint_free()
+ - name: Dump install logs
+ shell: cmd
+ run: cat r/check/arrow.Rcheck/00install.out
+ if: always()
+ - name: Dump test logs
+ shell: bash
+ run: find r/check -name 'testthat.Rout*' -exec cat '{}' \; || true
+ if: always()
diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb
index 38900cf12f3..6626c67c3ab 100644
--- a/c_glib/test/test-orc-file-reader.rb
+++ b/c_glib/test/test-orc-file-reader.rb
@@ -185,8 +185,8 @@ def all_columns
test("select fields") do
require_gi_bindings(3, 2, 6)
@reader.field_indices = [1, 3]
- assert_equal(build_table("boolean1" => build_boolean_array([false, true]),
- "short1" => build_int16_array([1024, 2048])),
+ assert_equal(build_table("byte1" => build_int8_array([1, 100]),
+ "int1" => build_int32_array([65536, 65536])),
@reader.read_stripes)
end
end
@@ -200,10 +200,8 @@ def all_columns
test("select fields") do
require_gi_bindings(3, 2, 6)
@reader.field_indices = [1, 3]
- boolean1 = build_boolean_array([false, true])
- short1 = build_int16_array([1024, 2048])
- assert_equal(build_record_batch("boolean1" => boolean1,
- "short1" => short1),
+ assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]),
+ "int1" => build_int32_array([65536, 65536])),
@reader.read_stripe(0))
end
end
diff --git a/ci/conan/all/conanfile.py b/ci/conan/all/conanfile.py
index 26abcc028b3..726f83239b5 100644
--- a/ci/conan/all/conanfile.py
+++ b/ci/conan/all/conanfile.py
@@ -302,7 +302,7 @@ def requirements(self):
if self._with_thrift():
self.requires("thrift/0.16.0")
if self._with_protobuf():
- self.requires("protobuf/3.21.1")
+ self.requires("protobuf/3.21.4")
if self._with_jemalloc():
self.requires("jemalloc/5.2.1")
if self._with_boost():
@@ -346,7 +346,7 @@ def requirements(self):
if self.options.with_zstd:
self.requires("zstd/1.5.2")
if self._with_re2():
- self.requires("re2/20220201")
+ self.requires("re2/20220601")
if self._with_utf8proc():
self.requires("utf8proc/2.7.0")
if self.options.with_backtrace:
diff --git a/ci/scripts/go_bench.sh b/ci/scripts/go_bench.sh
new file mode 100644
index 00000000000..5347b42524e
--- /dev/null
+++ b/ci/scripts/go_bench.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# this will output the benchmarks to STDOUT but if `-json` is passed
+# as the second argument, it will create a file "bench_stats.json"
+# in the directory this is called from containing a json representation
+
+set -ex
+
+# simplistic semver comparison
+verlte() {
+  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]  # true when $1 sorts first, i.e. $1 <= $2
+}
+verlt() {
+  [ "$1" = "$2" ] && return 1 || verlte "$1" "$2"  # strictly less: equal versions are rejected
+}
+
+ver=$(go env GOVERSION)  # the leading "go" is stripped with ${ver#go} below
+
+source_dir="${1}/go"  # $1 is the root of the Arrow checkout
+
+export PARQUET_TEST_DATA="${1}/cpp/submodules/parquet-testing/data"
+pushd "${source_dir}"
+
+go test -bench=. -benchmem -run=^$ ./... | tee bench_stat.dat  # -run=^$ matches no tests: benchmarks only
+
+if verlte "1.18" "${ver#go}"; then  # compute benchmarks only run on Go >= 1.18
+  go test -bench=. -benchmem -run=^$ ./arrow/compute | tee bench_stat_compute.dat
+fi
+
+popd
+
+if [[ "$2" = "-json" ]]; then
+  go install go.bobheadxi.dev/gobenchdata@latest
+  export PATH="$(go env GOPATH)/bin:$PATH"
+  cat "${source_dir}"/bench_*.dat | gobenchdata --json bench_stats.json
+fi
+
+rm "${source_dir}"/bench_*.dat
\ No newline at end of file
diff --git a/ci/scripts/go_bench_adapt.py b/ci/scripts/go_bench_adapt.py
new file mode 100644
index 00000000000..db1c09cbc59
--- /dev/null
+++ b/ci/scripts/go_bench_adapt.py
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import os
+import uuid
+import logging
+from pathlib import Path
+from typing import List
+
+from benchadapt import BenchmarkResult
+from benchadapt.adapters import BenchmarkAdapter
+from benchadapt.log import log
+
+log.setLevel(logging.DEBUG)
+
+ARROW_ROOT = Path(__file__).parent.parent.parent.resolve()
+SCRIPTS_PATH = ARROW_ROOT / "ci" / "scripts"
+RUN_REASON = "commit" if os.environ.get("CONBENCH_REF") == "master" else "branch"
+
+class GoAdapter(BenchmarkAdapter):
+ result_file = "bench_stats.json"
+ command = ["bash", SCRIPTS_PATH / "go_bench.sh", ARROW_ROOT, "-json"]
+
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(command=self.command, *args, **kwargs)
+
+ def _transform_results(self) -> List[BenchmarkResult]:
+ with open(self.result_file, "r") as f:
+ raw_results = json.load(f)
+
+ run_id = uuid.uuid4().hex
+ parsed_results = []
+ for suite in raw_results[0]["Suites"]:
+ batch_id = uuid.uuid4().hex
+ pkg = suite["Pkg"]
+
+ for benchmark in suite["Benchmarks"]:
+ data = benchmark["Mem"]["MBPerSec"] * 1e6
+ time = 1 / benchmark["NsPerOp"] * 1e9
+
+ name = benchmark["Name"].removeprefix('Benchmark')
+ ncpu = name[name.rfind('-')+1:]
+ pieces = name[:-(len(ncpu)+1)].split('/')
+
+ parsed = BenchmarkResult(
+ run_id=run_id,
+ batch_id=batch_id,
+ stats={
+ "data": [data],
+ "unit": "b/s",
+ "times": [time],
+ "time_unit": "i/s",
+ "iterations": benchmark["Runs"],
+ },
+ context={
+ "benchmark_language": "Go",
+ "goos": suite["Goos"],
+ "goarch": suite["Goarch"],
+ },
+ tags={
+ "pkg": pkg,
+ "num_cpu": ncpu,
+ "name": pieces[0],
+ "params": '/'.join(pieces[1:]),
+ },
+ run_reason=RUN_REASON,
+ )
+ parsed.run_name = f"{parsed.run_reason}: {parsed.github['commit']}"
+ parsed_results.append(parsed)
+
+ return parsed_results
+
+
+if __name__ == "__main__":
+ go_adapter = GoAdapter(result_fields_override={"info":{}})
+ go_adapter()
\ No newline at end of file
diff --git a/ci/scripts/go_test.sh b/ci/scripts/go_test.sh
index e31fa555642..54b05c3cc2b 100755
--- a/ci/scripts/go_test.sh
+++ b/ci/scripts/go_test.sh
@@ -61,7 +61,7 @@ pushd ${source_dir}/arrow
TAGS="assert,test"
if [[ -n "${ARROW_GO_TESTCGO}" ]]; then
if [[ "${MSYSTEM}" = "MINGW64" ]]; then
- export PATH=${MINGW_PREFIX}/bin:$PATH
+ export PATH=${MINGW_PREFIX}\\bin:${MINGW_PREFIX}\\lib:$PATH
fi
TAGS="${TAGS},ccalloc"
fi
diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh
index 853f03267bd..c801f90d414 100755
--- a/ci/scripts/r_docker_configure.sh
+++ b/ci/scripts/r_docker_configure.sh
@@ -72,11 +72,18 @@ fi
if [[ -n "$DEVTOOLSET_VERSION" ]]; then
$PACKAGE_MANAGER install -y centos-release-scl
$PACKAGE_MANAGER install -y "devtoolset-$DEVTOOLSET_VERSION"
-
- # Only add make var if not set
- if ! grep -Fq "CXX17=" ~/.R/Makevars &> /dev/null; then
+
+ # Enable devtoolset here so that `which gcc` finds the right compiler below
+ source /opt/rh/devtoolset-${DEVTOOLSET_VERSION}/enable
+
+ # Build images which require the devtoolset don't have CXX17 variables
+ # set as the system compiler doesn't support C++17
+  if [ ! "`${R_BIN} CMD config CXX17`" ]; then
     mkdir -p ~/.R
-    echo "CXX17=g++ -std=gnu++17 -g -O2 -fpic" >> ~/.R/Makevars
+    echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars
+    echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars
+    echo "CXX17STD = -std=c++17" >> ~/.R/Makevars
+    echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars
   fi
fi
diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh
index f532bc7cf0a..d7df44e2e43 100755
--- a/ci/scripts/r_test.sh
+++ b/ci/scripts/r_test.sh
@@ -26,19 +26,6 @@ pushd ${source_dir}
printenv
-if [[ -n "$DEVTOOLSET_VERSION" ]]; then
- # enable the devtoolset version to use it
- source /opt/rh/devtoolset-$DEVTOOLSET_VERSION/enable
-
- # Build images which require the devtoolset don't have CXX17 variables
- # set as the system compiler doesn't support C++17
- mkdir -p ~/.R
- echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars
- echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars
- echo "CXX17STD = -std=c++17" >> ~/.R/Makevars
- echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars
-fi
-
# Run the nixlibs.R test suite, which is not included in the installed package
${R_BIN} -e 'setwd("tools"); testthat::test_dir(".")'
diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh
index c361af1d267..6b6a5dd0c99 100755
--- a/ci/scripts/r_windows_build.sh
+++ b/ci/scripts/r_windows_build.sh
@@ -23,26 +23,15 @@ set -ex
# Make sure it is absolute and exported
export ARROW_HOME="$(cd "${ARROW_HOME}" && pwd)"
-if [ "$RTOOLS_VERSION" = "35" ]; then
- # Use rtools-backports if building with rtools35
- curl https://raw.githubusercontent.com/r-windows/rtools-backports/master/pacman.conf > /etc/pacman.conf
- pacman --noconfirm -Syy
- # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5)
- RWINLIB_LIB_DIR="lib-4.9.3"
- # This is the default (will build for each arch) but we can set up CI to
- # do these in parallel
- : ${MINGW_ARCH:="mingw32 mingw64"}
-else
- # Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN
- # curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf
- # curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz"
- # pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz
- # pacman --noconfirm -Scc
-
- pacman --noconfirm -Syy
- RWINLIB_LIB_DIR="lib"
- : ${MINGW_ARCH:="mingw32 mingw64 ucrt64"}
-fi
+# Uncomment L38-41 if you're testing a new rtools dependency that hasn't yet sync'd to CRAN
+# curl https://raw.githubusercontent.com/r-windows/rtools-packages/master/pacman.conf > /etc/pacman.conf
+# curl -OSsl "http://repo.msys2.org/msys/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz"
+# pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz
+# pacman --noconfirm -Scc
+
+pacman --noconfirm -Syy
+RWINLIB_LIB_DIR="lib"
+: ${MINGW_ARCH:="mingw32 mingw64 ucrt64"}
export MINGW_ARCH
@@ -78,26 +67,19 @@ fi
if [ -d mingw64/lib/ ]; then
ls $MSYS_LIB_DIR/mingw64/lib/
# Make the rest of the directory structure
- # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5)
- mkdir -p $DST_DIR/${RWINLIB_LIB_DIR}/x64
- # lib is for the new gcc 8 toolchain (Rtools 4.0)
mkdir -p $DST_DIR/lib/x64
# Move the 64-bit versions of libarrow into the expected location
- mv mingw64/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/x64
- # These may be from https://dl.bintray.com/rtools/backports/
- cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x64
+ mv mingw64/lib/*.a $DST_DIR/lib/x64
# These are from https://dl.bintray.com/rtools/mingw{32,64}/
- cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64
+ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/x64
fi
# Same for the 32-bit versions
if [ -d mingw32/lib/ ]; then
ls $MSYS_LIB_DIR/mingw32/lib/
- mkdir -p $DST_DIR/${RWINLIB_LIB_DIR}/i386
mkdir -p $DST_DIR/lib/i386
- mv mingw32/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/i386
- cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386
- cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386
+ mv mingw32/lib/*.a $DST_DIR/lib/i386
+ cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy,zstd,lz4,brotli*,bz2,crypto,curl,ss*,utf8proc,re2,aws*}.a $DST_DIR/lib/i386
fi
# Do the same also for ucrt64
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index cef4eb0b161..1abc6a5fe46 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -400,22 +400,13 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -Wno-noexcept-type")
endif()
- if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.2")
- # Disabling semantic interposition allows faster calling conventions
- # when calling global functions internally, and can also help inlining.
- # See https://stackoverflow.com/questions/35745543/new-option-in-gcc-5-3-fno-semantic-interposition
- set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fno-semantic-interposition")
- endif()
-
- if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
- # Add colors when paired with ninja
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
- endif()
+ # Disabling semantic interposition allows faster calling conventions
+ # when calling global functions internally, and can also help inlining.
+ # See https://stackoverflow.com/questions/35745543/new-option-in-gcc-5-3-fno-semantic-interposition
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -fno-semantic-interposition")
- if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.0")
- # Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43407
- set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes")
- endif()
+ # Add colors when paired with ninja
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always")
if(CMAKE_UNITY_BUILD)
# Work around issue similar to https://bugs.webkit.org/show_bug.cgi?id=176869
@@ -507,18 +498,12 @@ if(ARROW_CPU_FLAG STREQUAL "armv8")
add_definitions(-DARROW_HAVE_NEON)
- if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS
- "5.4")
- message(WARNING "Disable Armv8 CRC and Crypto as compiler doesn't support them well."
- )
- else()
- if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto")
- add_definitions(-DARROW_HAVE_ARMV8_CRYPTO)
- endif()
- # armv8.1+ implies crc support
- if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc")
- add_definitions(-DARROW_HAVE_ARMV8_CRC)
- endif()
+ if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto")
+ add_definitions(-DARROW_HAVE_ARMV8_CRYPTO)
+ endif()
+ # armv8.1+ implies crc support
+ if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc")
+ add_definitions(-DARROW_HAVE_ARMV8_CRC)
endif()
elseif(NOT ARROW_SIMD_LEVEL STREQUAL "NONE")
message(WARNING "ARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} not supported by Arm.")
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 9b6cc4865f3..65925dd7b0f 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -652,18 +652,9 @@ endif()
if(DEFINED ENV{ARROW_SNAPPY_URL})
set(SNAPPY_SOURCE_URL "$ENV{ARROW_SNAPPY_URL}")
else()
- if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS
- "4.9")
- # There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 "SNAPPY_OLD" for those (ARROW-14661)
- set_urls(SNAPPY_SOURCE_URL
- "https://github.com/google/snappy/archive/${ARROW_SNAPPY_OLD_BUILD_VERSION}.tar.gz"
- "${THIRDPARTY_MIRROR_URL}/snappy-${ARROW_SNAPPY_OLD_BUILD_VERSION}.tar.gz")
- set(ARROW_SNAPPY_BUILD_SHA256_CHECKSUM ${ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM})
- else()
- set_urls(SNAPPY_SOURCE_URL
- "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz"
- "${THIRDPARTY_MIRROR_URL}/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz")
- endif()
+ set_urls(SNAPPY_SOURCE_URL
+ "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz"
+ "${THIRDPARTY_MIRROR_URL}/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz")
endif()
# Remove these two lines once https://github.com/substrait-io/substrait/pull/342 merges
@@ -3943,7 +3934,7 @@ macro(build_grpc)
gRPC::grpc
gRPC::grpcpp_for_bundling
gRPC::upb)
- if(ABS_VENDORED)
+ if(ABSL_VENDORED)
list(APPEND ARROW_BUNDLED_STATIC_LIBS ${GRPC_GPR_ABSL_LIBRARIES})
endif()
endmacro()
@@ -4618,10 +4609,6 @@ endif()
macro(build_awssdk)
message(STATUS "Building AWS C++ SDK from source")
- if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS
- "4.9")
- message(FATAL_ERROR "AWS C++ SDK requires gcc >= 4.9")
- endif()
set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install")
set(AWSSDK_INCLUDE_DIR "${AWSSDK_PREFIX}/include")
set(AWSSDK_LIB_DIR "lib")
diff --git a/cpp/proto/substrait/extension_rels.proto b/cpp/proto/substrait/extension_rels.proto
index 518412969f5..6f806d00e5b 100644
--- a/cpp/proto/substrait/extension_rels.proto
+++ b/cpp/proto/substrait/extension_rels.proto
@@ -16,7 +16,7 @@
// under the License.
syntax = "proto3";
-package arrow.substrait_ext;
+package arrow.substrait;
import "substrait/algebra.proto";
@@ -25,12 +25,20 @@ option go_package = "github.com/apache/arrow/substrait";
option java_multiple_files = true;
option java_package = "io.arrow.substrait";
+// As-Of-Join relation
message AsOfJoinRel {
- repeated AsOfJoinKeys input_keys = 1;
+ // One key per input relation, each key describing how to join the corresponding input
+ repeated AsOfJoinKey keys = 1;
+
+ // As-Of tolerance, in units of the on-key
int64 tolerance = 2;
- message AsOfJoinKeys {
+ // As-Of-Join key
+ message AsOfJoinKey {
+ // A field reference defining the on-key
.substrait.Expression on = 1;
+
+ // A set of field references defining the by-key
repeated .substrait.Expression by = 2;
}
}
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 5af5ebccc84..18f88bc6dfb 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -411,7 +411,7 @@ class ORCFileReader::Impl {
ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
include_indices_list.push_back(*it);
}
- opts->includeTypes(include_indices_list);
+ opts->include(include_indices_list);
return Status::OK();
}
diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 5c234cc97c4..afc4bdb1d3b 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -226,7 +226,8 @@ std::shared_ptr
GenerateRandomTable(const std::shared_ptr& schema
void AssertTableWriteReadEqual(const std::shared_ptr& input_table,
const std::shared_ptr& expected_output_table,
- const int64_t max_size = kDefaultSmallMemStreamSize) {
+ const int64_t max_size = kDefaultSmallMemStreamSize,
+ std::vector* opt_selected_read_indices = nullptr) {
EXPECT_OK_AND_ASSIGN(auto buffer_output_stream,
io::BufferOutputStream::Create(max_size));
auto write_options = adapters::orc::WriteOptions();
@@ -250,7 +251,11 @@ void AssertTableWriteReadEqual(const std::shared_ptr& input_table,
ASSERT_EQ(reader->GetCompression(), write_options.compression);
ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size);
ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride);
- EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read());
+ EXPECT_OK_AND_ASSIGN(auto actual_output_table,
+ opt_selected_read_indices == nullptr
+ ? reader->Read()
+ : reader->Read(*opt_selected_read_indices));
+ ASSERT_OK(actual_output_table->ValidateFull());
AssertTablesEqual(*expected_output_table, *actual_output_table, false, false);
}
@@ -451,6 +456,37 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) {
std::shared_ptr table = TableFromJSON(table_schema, {});
AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16);
}
+TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) {
+ std::shared_ptr table = TableFromJSON(table_schema, {R"([])"});
+ std::shared_ptr schema_selected =
+ schema({field("int8", int8()), field("int32", int32())});
+ std::shared_ptr table_selected = TableFromJSON(schema_selected, {R"([])"});
+ std::vector selected_indices = {1, 3};
+ AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16,
+ &selected_indices);
+}
+TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) {
+ std::vector selected_indices = {1, 7};
+ random::RandomArrayGenerator rand(kRandomSeed);
+ std::shared_ptr local_schema = schema({
+ field("bool", boolean()),
+ field("int32", int32()),
+ field("int64", int64()),
+ field("float", float32()),
+ field("struct", struct_({field("a", utf8()), field("b", int64())})),
+ field("double", float64()),
+ field("date32", date32()),
+ field("ts3", timestamp(TimeUnit::NANO)),
+ field("string", utf8()),
+ field("binary", binary()),
+ });
+ auto batch = rand.BatchOf(local_schema->fields(), 100);
+ std::shared_ptr table = Table::Make(local_schema, batch->columns());
+ EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices));
+ AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize,
+ &selected_indices);
+}
+
class TestORCWriterTrivialWithConversion : public ::testing::Test {
public:
TestORCWriterTrivialWithConversion() {
diff --git a/cpp/src/arrow/compute/exec/asof_join_benchmark.cc b/cpp/src/arrow/compute/exec/asof_join_benchmark.cc
index 3c6b78d29f1..1e485401d6b 100644
--- a/cpp/src/arrow/compute/exec/asof_join_benchmark.cc
+++ b/cpp/src/arrow/compute/exec/asof_join_benchmark.cc
@@ -115,6 +115,15 @@ AsofJoinNodeOptions GetRepeatedOptions(size_t repeat, FieldRef on_key,
return AsofJoinNodeOptions(input_keys, tolerance);
}
+AsofJoinNodeOptions GetRepeatedOptions(size_t repeat, FieldRef on_key,
+ std::vector by_key, int64_t tolerance) {
+ std::vector input_keys(repeat);
+ for (size_t i = 0; i < repeat; i++) {
+ input_keys[i] = {on_key, by_key};
+ }
+ return AsofJoinNodeOptions(input_keys, tolerance);
+}
+
static void AsOfJoinOverhead(benchmark::State& state) {
int64_t tolerance = 0;
AsofJoinNodeOptions options =
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index f07254e1115..da9748de96e 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -742,16 +742,7 @@ Result> ParquetFileFragment::TryCountRows(
compute::Expression predicate) {
DCHECK_NE(metadata_, nullptr);
if (ExpressionHasFieldRefs(predicate)) {
-#if defined(__GNUC__) && (__GNUC__ < 5)
- // ARROW-12694: with GCC 4.9 (RTools 35) we sometimes segfault here if we move(result)
- auto result = TestRowGroups(std::move(predicate));
- if (!result.ok()) {
- return result.status();
- }
- auto expressions = result.ValueUnsafe();
-#else
ARROW_ASSIGN_OR_RAISE(auto expressions, TestRowGroups(std::move(predicate)));
-#endif
int64_t rows = 0;
for (size_t i = 0; i < row_groups_->size(); i++) {
// If the row group is entirely excluded, exclude it from the row count
diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
index 08c9b833b1c..88d28a17ed6 100644
--- a/cpp/src/arrow/dataset/scanner.cc
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -786,7 +786,19 @@ Result ProjectionDescr::FromNames(std::vector name
const Schema& dataset_schema) {
std::vector exprs(names.size());
for (size_t i = 0; i < exprs.size(); ++i) {
- exprs[i] = compute::field_ref(names[i]);
+ // If name isn't in schema, try finding it by dotted path.
+ if (dataset_schema.GetFieldByName(names[i]) == nullptr) {
+ auto name = names[i];
+ if (name.rfind(".", 0) != 0) {
+ name = "." + name;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(name));
+ // safe as we know there is at least 1 dot.
+ names[i] = name.substr(name.rfind(".") + 1);
+ exprs[i] = compute::field_ref(field_ref);
+ } else {
+ exprs[i] = compute::field_ref(names[i]);
+ }
}
auto fields = dataset_schema.fields();
for (const auto& aug_field : kAugmentedFields) {
diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc
index 59fb8089c0a..73694f4b33a 100644
--- a/cpp/src/arrow/dataset/scanner_test.cc
+++ b/cpp/src/arrow/dataset/scanner_test.cc
@@ -1088,6 +1088,24 @@ TEST_P(TestScanner, ProjectedScanNested) {
AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out);
}
+TEST_P(TestScanner, ProjectedScanNestedFromNames) {
+ SetSchema({
+ field("struct", struct_({field("i32", int32()), field("f64", float64())})),
+ field("nested", struct_({field("left", int32()),
+ field("right", struct_({field("i32", int32()),
+ field("f64", float64())}))})),
+ });
+ ASSERT_OK_AND_ASSIGN(auto descr,
+ ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"},
+ *options_->dataset_schema))
+ SetProjection(options_.get(), std::move(descr));
+ auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_);
+ auto batch_out = ConstantArrayGenerator::Zeroes(
+ GetParam().items_per_batch,
+ schema({field("i32", int32()), field("f64", float64())}));
+ AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out);
+}
+
TEST_P(TestScanner, MaterializeMissingColumn) {
SetSchema({field("i32", int32()), field("f64", float64())});
auto batch_missing_f64 = ConstantArrayGenerator::Zeroes(
diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h
index ee9c2fcad8f..8764eed27aa 100644
--- a/cpp/src/arrow/dataset/test_util.h
+++ b/cpp/src/arrow/dataset/test_util.h
@@ -157,7 +157,7 @@ class DatasetFixtureMixin : public ::testing::Test {
std::shared_ptr lhs;
ASSERT_OK(expected->ReadNext(&lhs));
EXPECT_NE(lhs, nullptr);
- AssertBatchesEqual(*lhs, batch);
+ AssertBatchesEqual(*lhs, batch, true);
}
/// \brief Ensure that record batches found in reader are equals to the
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc
index 343d8cd9ee6..823576003c4 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.cc
+++ b/cpp/src/arrow/engine/substrait/expression_internal.cc
@@ -37,6 +37,8 @@ using internal::checked_cast;
namespace engine {
+namespace substrait = ::substrait;
+
namespace {
Id NormalizeFunctionName(Id id) {
diff --git a/cpp/src/arrow/engine/substrait/expression_internal.h b/cpp/src/arrow/engine/substrait/expression_internal.h
index f132afc0c1a..6eb7d4117c8 100644
--- a/cpp/src/arrow/engine/substrait/expression_internal.h
+++ b/cpp/src/arrow/engine/substrait/expression_internal.h
@@ -32,6 +32,8 @@
namespace arrow {
namespace engine {
+namespace substrait = ::substrait;
+
ARROW_ENGINE_EXPORT
Result FromProto(const substrait::Expression&, const ExtensionSet&,
const ConversionOptions&);
diff --git a/cpp/src/arrow/engine/substrait/options.cc b/cpp/src/arrow/engine/substrait/options.cc
index 6614814771b..f7c8bf4713e 100644
--- a/cpp/src/arrow/engine/substrait/options.cc
+++ b/cpp/src/arrow/engine/substrait/options.cc
@@ -28,13 +28,15 @@
namespace arrow {
namespace engine {
+namespace substrait = ::substrait;
+
class DefaultExtensionProvider : public ExtensionProvider {
public:
Result MakeRel(const std::vector& inputs,
const google::protobuf::Any& rel,
const ExtensionSet& ext_set) override {
- if (rel.Is()) {
- substrait_ext::AsOfJoinRel as_of_join_rel;
+ if (rel.Is()) {
+ arrow::substrait::AsOfJoinRel as_of_join_rel;
rel.UnpackTo(&as_of_join_rel);
return MakeAsOfJoinRel(inputs, as_of_join_rel, ext_set);
}
@@ -45,18 +47,18 @@ class DefaultExtensionProvider : public ExtensionProvider {
private:
Result MakeAsOfJoinRel(
const std::vector& inputs,
- const substrait_ext::AsOfJoinRel& as_of_join_rel, const ExtensionSet& ext_set) {
+ const arrow::substrait::AsOfJoinRel& as_of_join_rel, const ExtensionSet& ext_set) {
if (inputs.size() < 2) {
return Status::Invalid("substrait::AsOfJoinNode too few input tables: ",
inputs.size());
}
- if (static_cast(as_of_join_rel.input_keys_size()) != inputs.size()) {
+ if (static_cast(as_of_join_rel.keys_size()) != inputs.size()) {
return Status::Invalid("substrait::AsOfJoinNode mismatched number of inputs");
}
size_t n_input = inputs.size(), i = 0;
std::vector input_keys(n_input);
- for (const auto& keys : as_of_join_rel.input_keys()) {
+ for (const auto& keys : as_of_join_rel.keys()) {
// on-key
if (!keys.has_on()) {
return Status::Invalid("substrait::AsOfJoinNode missing on-key for input ", i);
diff --git a/cpp/src/arrow/engine/substrait/options.h b/cpp/src/arrow/engine/substrait/options.h
index 2f46abbbfb8..57f29f65630 100644
--- a/cpp/src/arrow/engine/substrait/options.h
+++ b/cpp/src/arrow/engine/substrait/options.h
@@ -36,7 +36,7 @@ namespace engine {
/// How strictly to adhere to the input structure when converting between Substrait and
/// Acero representations of a plan. This allows the user to trade conversion accuracy
/// for performance and lenience.
-enum class ConversionStrictness {
+enum class ARROW_ENGINE_EXPORT ConversionStrictness {
/// When a primitive is used at the input that doesn't have an exact match at the
/// output, reject the conversion. This effectively asserts that there is no (known)
/// information loss in the conversion, and that plans should either round-trip back and
diff --git a/cpp/src/arrow/engine/substrait/plan_internal.cc b/cpp/src/arrow/engine/substrait/plan_internal.cc
index 64f129101f4..4b1abb7677b 100644
--- a/cpp/src/arrow/engine/substrait/plan_internal.cc
+++ b/cpp/src/arrow/engine/substrait/plan_internal.cc
@@ -34,6 +34,8 @@ using internal::checked_cast;
namespace engine {
+namespace substrait = ::substrait;
+
Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan) {
plan->clear_extension_uris();
diff --git a/cpp/src/arrow/engine/substrait/plan_internal.h b/cpp/src/arrow/engine/substrait/plan_internal.h
index 7d6dd375288..473edac7239 100644
--- a/cpp/src/arrow/engine/substrait/plan_internal.h
+++ b/cpp/src/arrow/engine/substrait/plan_internal.h
@@ -30,6 +30,8 @@
namespace arrow {
namespace engine {
+namespace substrait = ::substrait;
+
/// \brief Replaces the extension information of a Substrait Plan message with the given
/// extension set, such that the anchors defined therein can be used in the rest of the
/// plan.
diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc
index d06cc7ef6f2..0363d382f10 100644
--- a/cpp/src/arrow/engine/substrait/relation_internal.cc
+++ b/cpp/src/arrow/engine/substrait/relation_internal.cc
@@ -43,6 +43,8 @@ using internal::UriFromAbsolutePath;
namespace engine {
+namespace substrait = ::substrait;
+
struct EmitInfo {
std::vector expressions;
std::shared_ptr schema;
diff --git a/cpp/src/arrow/engine/substrait/relation_internal.h b/cpp/src/arrow/engine/substrait/relation_internal.h
index c7241490783..3f48cb55e62 100644
--- a/cpp/src/arrow/engine/substrait/relation_internal.h
+++ b/cpp/src/arrow/engine/substrait/relation_internal.h
@@ -31,6 +31,8 @@
namespace arrow {
namespace engine {
+namespace substrait = ::substrait;
+
/// Information resulting from converting a Substrait relation.
struct ARROW_ENGINE_EXPORT DeclarationInfo {
/// The compute declaration produced thus far.
diff --git a/cpp/src/arrow/engine/substrait/serde.cc b/cpp/src/arrow/engine/substrait/serde.cc
index b2cac3f82e4..22ef0d38f7a 100644
--- a/cpp/src/arrow/engine/substrait/serde.cc
+++ b/cpp/src/arrow/engine/substrait/serde.cc
@@ -42,6 +42,8 @@
namespace arrow {
namespace engine {
+namespace substrait = ::substrait;
+
Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name,
google::protobuf::Message* message) {
google::protobuf::io::ArrayInputStream buf_stream{buf.data(),
diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc
index 09d260aadf6..fb4454c8d97 100644
--- a/cpp/src/arrow/engine/substrait/serde_test.cc
+++ b/cpp/src/arrow/engine/substrait/serde_test.cc
@@ -2255,16 +2255,6 @@ TEST(SubstraitRoundTrip, BasicPlanEndToEnd) {
EXPECT_TRUE(expected_table->Equals(*rnd_trp_table));
}
-NamedTableProvider ProvideMadeTable(
- std::function>(const std::vector&)> make) {
- return [make](const std::vector& names) -> Result {
- ARROW_ASSIGN_OR_RAISE(auto table, make(names));
- std::shared_ptr options =
- std::make_shared(table);
- return compute::Declaration("table_source", {}, options, "mock_source");
- };
-}
-
TEST(SubstraitRoundTrip, ProjectRel) {
#ifdef _WIN32
GTEST_SKIP() << "ARROW-16392: Substrait File URI not supported for Windows";
@@ -3330,6 +3320,16 @@ TEST(Substrait, IsthmusPlan) {
*compute::default_exec_context(), buf, {}, conversion_options);
}
+NamedTableProvider ProvideMadeTable(
+ std::function>(const std::vector&)> make) {
+ return [make](const std::vector& names) -> Result {
+ ARROW_ASSIGN_OR_RAISE(auto table, make(names));
+ std::shared_ptr options =
+ std::make_shared(table);
+ return compute::Declaration("table_source", {}, options, "mock_source");
+ };
+}
+
TEST(Substrait, ProjectWithMultiFieldExpressions) {
compute::ExecContext exec_context;
auto dummy_schema =
@@ -3659,7 +3659,7 @@ TEST(Substrait, NestedEmitProjectWithMultiFieldExpressions) {
buf, {}, conversion_options);
}
-TEST(Substrait, PlanWithExtension) {
+TEST(Substrait, PlanWithAsOfJoinExtension) {
// This demos an extension relation
std::string substrait_json = R"({
"extensionUris": [],
@@ -3752,8 +3752,8 @@ TEST(Substrait, PlanWithExtension) {
}
],
"detail": {
- "@type": "/arrow.substrait_ext.AsOfJoinRel",
- "input_keys" : [
+ "@type": "/arrow.substrait.AsOfJoinRel",
+ "keys" : [
{
"on": {
"selection": {
diff --git a/cpp/src/arrow/engine/substrait/test_plan_builder.cc b/cpp/src/arrow/engine/substrait/test_plan_builder.cc
index 79820672ed9..a6b2d7c6780 100644
--- a/cpp/src/arrow/engine/substrait/test_plan_builder.cc
+++ b/cpp/src/arrow/engine/substrait/test_plan_builder.cc
@@ -31,6 +31,8 @@
namespace arrow {
namespace engine {
+
+namespace substrait = ::substrait;
namespace internal {
static const ConversionOptions kPlanBuilderConversionOptions;
diff --git a/cpp/src/arrow/engine/substrait/type_internal.h b/cpp/src/arrow/engine/substrait/type_internal.h
index 6db9aea01ae..9b4132c74c2 100644
--- a/cpp/src/arrow/engine/substrait/type_internal.h
+++ b/cpp/src/arrow/engine/substrait/type_internal.h
@@ -31,6 +31,8 @@
namespace arrow {
namespace engine {
+namespace substrait = ::substrait;
+
ARROW_ENGINE_EXPORT
Result, bool>> FromProto(const substrait::Type&,
const ExtensionSet&,
diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc
index e7162eb981c..b2f947eea46 100644
--- a/cpp/src/parquet/column_reader_test.cc
+++ b/cpp/src/parquet/column_reader_test.cc
@@ -260,7 +260,8 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) {
ASSERT_NO_FATAL_FAILURE(ExecuteDict(num_pages, levels_per_page, &descr));
}
-TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
+// Tests skipping around page boundaries.
+TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundries) {
int levels_per_page = 100;
int num_pages = 5;
max_def_level_ = 0;
@@ -289,10 +290,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
values_.begin() + static_cast(2.5 * static_cast(levels_per_page)));
ASSERT_TRUE(vector_equal(sub_values, vresult));
- // 2) skip_size == page_size (skip across two pages)
+ // 2) skip_size == page_size (skip across two pages from page 2.5 to 3.5)
levels_skipped = reader->Skip(levels_per_page);
ASSERT_EQ(levels_per_page, levels_skipped);
- // Read half a page
+ // Read half a page (page 3.5 to 4)
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
&values_read);
sub_values.clear();
@@ -303,10 +304,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
ASSERT_TRUE(vector_equal(sub_values, vresult));
// 3) skip_size < page_size (skip limited to a single page)
- // Skip half a page
+ // Skip half a page (page 4 to 4.5)
levels_skipped = reader->Skip(levels_per_page / 2);
ASSERT_EQ(0.5 * levels_per_page, levels_skipped);
- // Read half a page
+ // Read half a page (page 4.5 to 5)
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
&values_read);
sub_values.clear();
@@ -316,6 +317,15 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
values_.end());
ASSERT_TRUE(vector_equal(sub_values, vresult));
+ // 4) skip_size = 0
+ levels_skipped = reader->Skip(0);
+ ASSERT_EQ(0, levels_skipped);
+
+ // 5) Skip past the end page. There are 5 pages and we have either skipped
+ // or read all of them, so there is nothing left to skip.
+ levels_skipped = reader->Skip(10);
+ ASSERT_EQ(0, levels_skipped);
+
values_.clear();
def_levels_.clear();
rep_levels_.clear();
@@ -323,6 +333,55 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
reader_.reset();
}
+// Skip with repeated field. This test makes it clear that we are skipping
+// values and not records.
+TEST_F(TestPrimitiveReader, TestSkipRepeatedField) {
+ // Example schema: message M { repeated int32 b = 1 }
+ max_def_level_ = 1;
+ max_rep_level_ = 1;
+ NodePtr type = schema::Int32("b", Repetition::REPEATED);
+ const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
+ // Example rows: {}, {[10, 10]}, {[20, 20, 20]}
+ std::vector values = {10, 10, 20, 20, 20};
+ std::vector def_levels = {0, 1, 1, 1, 1, 1};
+ std::vector rep_levels = {0, 0, 1, 0, 1, 1};
+ num_values_ = static_cast(def_levels.size());
+ std::shared_ptr page = MakeDataPage(
+ &descr, values, num_values_, Encoding::PLAIN, /*indices=*/{},
+ /*indices_size=*/0, def_levels, max_def_level_, rep_levels, max_rep_level_);
+
+ pages_.push_back(std::move(page));
+
+ InitReader(&descr);
+ Int32Reader* reader = static_cast(reader_.get());
+
+ // Vectors to hold read values, definition levels, and repetition levels.
+ std::vector read_vals(4, -1);
+ std::vector read_defs(4, -1);
+ std::vector read_reps(4, -1);
+
+ // Skip two levels.
+ int64_t levels_skipped = reader->Skip(2);
+ ASSERT_EQ(2, levels_skipped);
+
+ int64_t num_read_values = 0;
+ // Read the next set of values
+ reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
+ &num_read_values);
+ ASSERT_EQ(num_read_values, 4);
+ // Note that we end up in the record with {[10, 10]}
+ ASSERT_TRUE(vector_equal({10, 20, 20, 20}, read_vals));
+ ASSERT_TRUE(vector_equal({1, 1, 1, 1}, read_defs));
+ ASSERT_TRUE(vector_equal({1, 0, 1, 1}, read_reps));
+
+ // No values remain in data page
+ levels_skipped = reader->Skip(2);
+ ASSERT_EQ(0, levels_skipped);
+ reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
+ &num_read_values);
+ ASSERT_EQ(num_read_values, 0);
+}
+
// Page claims to have two values but only 1 is present.
TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
max_def_level_ = 1;
diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt
index 7409b66c86e..0cc496e9385 100644
--- a/cpp/thirdparty/versions.txt
+++ b/cpp/thirdparty/versions.txt
@@ -81,9 +81,6 @@ ARROW_RE2_BUILD_SHA256_CHECKSUM=f89c61410a072e5cbcf8c27e3a778da7d6fd2f2b5b1445cd
# 1.1.9 is patched to implement https://github.com/google/snappy/pull/148 if this is bumped, remove the patch
ARROW_SNAPPY_BUILD_VERSION=1.1.9
ARROW_SNAPPY_BUILD_SHA256_CHECKSUM=75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8b4f1c962b478b6e06e7
-# There is a bug in GCC < 4.9 with Snappy 1.1.9, so revert to 1.1.8 for those (ARROW-14661)
-ARROW_SNAPPY_OLD_BUILD_VERSION=1.1.8
-ARROW_SNAPPY_OLD_BUILD_SHA256_CHECKSUM=16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f
ARROW_SUBSTRAIT_BUILD_VERSION=v0.6.0
ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=7b8583b9684477e9027f417bbfb4febb8acfeb01923dcaa7cf0fd3f921d69c88
ARROW_THRIFT_BUILD_VERSION=0.16.0
@@ -94,8 +91,8 @@ ARROW_UTF8PROC_BUILD_VERSION=v2.7.0
ARROW_UTF8PROC_BUILD_SHA256_CHECKSUM=4bb121e297293c0fd55f08f83afab6d35d48f0af4ecc07523ad8ec99aa2b12a1
ARROW_XSIMD_BUILD_VERSION=9.0.1
ARROW_XSIMD_BUILD_SHA256_CHECKSUM=b1bb5f92167fd3a4f25749db0be7e61ed37e0a5d943490f3accdcd2cd2918cc0
-ARROW_ZLIB_BUILD_VERSION=1.2.12
-ARROW_ZLIB_BUILD_SHA256_CHECKSUM=91844808532e5ce316b3c010929493c0244f3d37593afd6de04f71821d5136d9
+ARROW_ZLIB_BUILD_VERSION=1.2.13
+ARROW_ZLIB_BUILD_SHA256_CHECKSUM=b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30
ARROW_ZSTD_BUILD_VERSION=v1.5.2
ARROW_ZSTD_BUILD_SHA256_CHECKSUM=f7de13462f7a82c29ab865820149e778cbfe01087b3a55b5332707abf9db4a6e
diff --git a/dev/archery/archery/crossbow/core.py b/dev/archery/archery/crossbow/core.py
index c8ea8a13a4b..a83c190d121 100644
--- a/dev/archery/archery/crossbow/core.py
+++ b/dev/archery/archery/crossbow/core.py
@@ -738,14 +738,16 @@ class Target(Serializable):
(currently only an email address where the notification should be sent).
"""
- def __init__(self, head, branch, remote, version, email=None):
+ def __init__(self, head, branch, remote, version, r_version, email=None):
self.head = head
self.email = email
self.branch = branch
self.remote = remote
self.github_repo = "/".join(_parse_github_user_repo(remote))
self.version = version
+ self.r_version = r_version
self.no_rc_version = re.sub(r'-rc\d+\Z', '', version)
+ self.no_rc_r_version = re.sub(r'-rc\d+\Z', '', r_version)
# TODO(ARROW-17552): Remove "master" from default_branch after
# migration to "main".
self.default_branch = ['main', 'master']
@@ -791,8 +793,39 @@ def from_repo(cls, repo, head=None, branch=None, remote=None, version=None,
if email is None:
email = repo.user_email
+ version_dev_match = re.match(r".*\.dev(\d+)$", version)
+ if version_dev_match:
+ with open(f"{repo.path}/r/DESCRIPTION") as description_file:
+ description = description_file.read()
+ r_version_pattern = re.compile(r"^Version:\s*(.*)$",
+ re.MULTILINE)
+ r_version = re.findall(r_version_pattern, description)[0]
+ if r_version:
+ version_dev = int(version_dev_match[1])
+ # "1_0000_00_00 +" is for generating a greater version
+ # than YYYYMMDD. For example, 1_0000_00_01
+ # (version_dev == 1 case) is greater than 2022_10_16.
+ #
+ # Why do we need a greater version than YYYYMMDD? To
+ # keep backward compatibility. We previously used
+ # MAJOR.MINOR.PATCH.YYYYMMDD as our nightly package
+ # version (see also ARROW-16403). If we used "9000 +
+ # version_dev" here, a developer who had installed
+ # 9.0.0.20221016 couldn't upgrade to a later nightly
+ # package until we release 10.0.0, because 9.0.0.9234
+ # (for example) is less than 9.0.0.20221016.
+ r_version_dev = 1_0000_00_00 + version_dev
+ # version: 10.0.0.dev234
+ # r_version: 9.0.0.9000
+ # -> 9.0.0.100000234
+ r_version = re.sub(r"\.9000\Z", f".{r_version_dev}", r_version)
+ else:
+ r_version = version
+ else:
+ r_version = version
+
return cls(head=head, email=email, branch=branch, remote=remote,
- version=version)
+ version=version, r_version=r_version)
def is_default_branch(self):
# TODO(ARROW-17552): Switch the condition to "is" instead of "in"
@@ -1105,7 +1138,10 @@ def from_config(cls, config, target, tasks=None, groups=None, params=None):
'version': target.version,
'no_rc_version': target.no_rc_version,
'no_rc_semver_version': target.no_rc_semver_version,
- 'no_rc_snapshot_version': target.no_rc_snapshot_version}
+ 'no_rc_snapshot_version': target.no_rc_snapshot_version,
+ 'r_version': target.r_version,
+ 'no_rc_r_version': target.no_rc_r_version,
+ }
for task_name, task in task_definitions.items():
task = task.copy()
artifacts = task.pop('artifacts', None) or [] # because of yaml
@@ -1260,6 +1296,7 @@ def validate(self):
branch='master',
remote='https://github.com/apache/arrow',
version='1.0.0dev123',
+ r_version='0.13.0.100000123',
email='dummy@example.ltd'
)
job = Job.from_config(config=self,
diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml
index 15e8ca3ff5e..eb03bbee0bd 100644
--- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml
+++ b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job-no-failure.yaml
@@ -5,7 +5,9 @@ target: !Target
branch: refs/pull/4435/merge
remote: https://github.com/apache/arrow
version: 0.13.0.dev306
+ r_version: 0.12.0.100000306
no_rc_version: 0.13.0.dev306
+ no_rc_r_version: 0.12.0.100000306
tasks:
docker-cpp-cmake32: !Task
ci: circle
@@ -64,4 +66,4 @@ branch: ursabot-1
_queue: !Queue
path: the_path
github_token: xxxxxxxxx
- _remote_url: https://github.com/apache/crossbow
\ No newline at end of file
+ _remote_url: https://github.com/apache/crossbow
diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml
index 90eab704988..f6de07dd456 100644
--- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml
+++ b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml
@@ -5,7 +5,9 @@ target: !Target
branch: refs/pull/4435/merge
remote: https://github.com/apache/arrow
version: 0.13.0.dev306
+ r_version: 0.12.0.100000306
no_rc_version: 0.13.0.dev306
+ no_rc_r_version: 0.12.0.100000306
tasks:
docker-cpp-cmake32: !Task
ci: circle
@@ -64,4 +66,4 @@ branch: ursabot-1
_queue: !Queue
path: the_path
github_token: xxxxxxxxx
- _remote_url: https://github.com/apache/crossbow
\ No newline at end of file
+ _remote_url: https://github.com/apache/crossbow
diff --git a/dev/archery/archery/docker/cli.py b/dev/archery/archery/docker/cli.py
index bbdd2261db6..c7b42c094f6 100644
--- a/dev/archery/archery/docker/cli.py
+++ b/dev/archery/archery/docker/cli.py
@@ -16,6 +16,7 @@
# under the License.
import os
+import sys
import click
@@ -289,3 +290,27 @@ def docker_compose_images(obj):
click.echo('Available images:')
for image in compose.images():
click.echo(f' - {image}')
+
+
+@docker.command('info')
+@click.argument('service_name')
+@click.option('--show', '-s', required=False,
+ help="Show only specific docker-compose key. Examples of keys:"
+ " command, environment, build, dockerfile")
+@click.pass_obj
+def docker_compose_info(obj, service_name, show):
+ """Show docker-compose definition info for service_name.
+
+ SERVICE_NAME is the name of the docker service defined on
+ the docker-compose. Look at `archery docker images` output for names.
+ """
+ compose = obj['compose']  # compose object stored on the click obj
+ try:
+ service = compose.config.raw_config["services"][service_name]
+ except KeyError:  # service_name is not a key under "services"
+ click.echo(f'Service name {service_name} could not be found', err=True)
+ sys.exit(1)  # non-zero exit status signals the failed lookup
+ else:
+ click.echo(f'Service {service_name} docker-compose config:')
+ output = "\n".join(compose.info(service, show))  # show may be None
+ click.echo(output)
diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py
index da15f86935b..de5e6cf41c9 100644
--- a/dev/archery/archery/docker/core.py
+++ b/dev/archery/archery/docker/core.py
@@ -95,12 +95,12 @@ def _read_config(self, config_path, compose_bin):
"""
yaml = YAML()
with config_path.open() as fp:
- config = yaml.load(fp)
+ self.raw_config = yaml.load(fp)
- services = config['services'].keys()
- self.hierarchy = dict(flatten(config.get('x-hierarchy', {})))
- self.limit_presets = config.get('x-limit-presets', {})
- self.with_gpus = config.get('x-with-gpus', [])
+ services = self.raw_config['services'].keys()
+ self.hierarchy = dict(flatten(self.raw_config.get('x-hierarchy', {})))
+ self.limit_presets = self.raw_config.get('x-limit-presets', {})
+ self.with_gpus = self.raw_config.get('x-with-gpus', [])
nodes = self.hierarchy.keys()
errors = []
@@ -417,3 +417,22 @@ def _push(service):
def images(self):
return sorted(self.config.hierarchy.keys())
+
+ def info(self, key_name, filters=None, prefix=' '):  # -> list of str
+ output = []  # rendered text lines, in iteration order
+ for key, value in key_name.items():  # key_name is a mapping
+ if hasattr(value, 'items'):  # nested mapping
+ temp_filters = filters  # filter handed down to the children
+ if key == filters or filters is None:
+ output.append(f'{prefix} {key}')
+ # This key matched the filter (or there is none), so list
+ # everything beneath it by recursing without a filter.
+ temp_filters = None
+ output.extend(self.info(value, temp_filters, prefix + " "))
+ else:  # leaf: any value without .items()
+ if key == filters or filters is None:
+ output.append(
+ f'{prefix} {key}: ' +
+ f'{value if value is not None else ""}'  # None -> ""
+ )
+ return output
diff --git a/dev/archery/archery/docker/tests/test_docker.py b/dev/archery/archery/docker/tests/test_docker.py
index 899a0449e1a..bc25738becf 100644
--- a/dev/archery/archery/docker/tests/test_docker.py
+++ b/dev/archery/archery/docker/tests/test_docker.py
@@ -114,6 +114,11 @@
arrow_compose_yml = """
version: '3.5'
+x-sccache: &sccache
+ AWS_ACCESS_KEY_ID:
+ AWS_SECRET_ACCESS_KEY:
+ SCCACHE_BUCKET:
+
x-with-gpus:
- ubuntu-cuda
@@ -162,6 +167,8 @@
image: org/ubuntu-cpp-cmake32
ubuntu-c-glib:
image: org/ubuntu-c-glib
+ environment:
+ <<: [*sccache]
ubuntu-ruby:
image: org/ubuntu-ruby
ubuntu-cuda:
@@ -529,3 +536,39 @@ def test_listing_images(arrow_compose_path):
'ubuntu-cuda',
'ubuntu-ruby',
]
+
+
+def test_service_info(arrow_compose_path):  # no filter: every key is listed
+ compose = DockerCompose(arrow_compose_path)
+ service = compose.config.raw_config["services"]["conda-cpp"]
+ assert compose.info(service) == [
+ " image: org/conda-cpp",
+ " build",
+ " context: .",
+ " dockerfile: ci/docker/conda-cpp.dockerfile"
+ ]
+
+
+def test_service_info_filters(arrow_compose_path):  # only the matching leaf
+ compose = DockerCompose(arrow_compose_path)
+ service = compose.config.raw_config["services"]["conda-cpp"]
+ assert compose.info(service, filters="dockerfile") == [
+ " dockerfile: ci/docker/conda-cpp.dockerfile"
+ ]
+
+
+def test_service_info_non_existing_filters(arrow_compose_path):  # no match
+ compose = DockerCompose(arrow_compose_path)
+ service = compose.config.raw_config["services"]["conda-cpp"]
+ assert compose.info(service, filters="non-existing") == []
+
+
+def test_service_info_inherited_env(arrow_compose_path):  # merged anchor keys
+ compose = DockerCompose(arrow_compose_path)
+ service = compose.config.raw_config["services"]["ubuntu-c-glib"]
+ assert compose.info(service, filters="environment") == [
+ " environment",
+ " AWS_ACCESS_KEY_ID: ",
+ " AWS_SECRET_ACCESS_KEY: ",
+ " SCCACHE_BUCKET: "
+ ]
diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py
index f6ecc7d1ffa..490ee787e33 100755
--- a/dev/merge_arrow_pr.py
+++ b/dev/merge_arrow_pr.py
@@ -426,7 +426,9 @@ def extract_co_authors(commit):
commit_title = f'{self.title} (#{self.number})'
commit_message_chunks = []
if self.body is not None:
- commit_message_chunks.append(self.body)
+ # avoid github user name references by inserting a space after @
+ body = re.sub(r"@(\w+)", "@ \\1", self.body)
+ commit_message_chunks.append(body)
committer_name = run_cmd("git config --get user.name").strip()
committer_email = run_cmd("git config --get user.email").strip()
diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh
index ad74dbce8d0..0c05cf2192c 100755
--- a/dev/release/post-08-docs.sh
+++ b/dev/release/post-08-docs.sh
@@ -81,7 +81,8 @@ tar xvf docs.tar.gz
rm -f docs.tar.gz
git checkout docs/c_glib/index.html
if [ "$is_major_release" = "yes" ] ; then
- mv docs_temp docs/${previous_version}
+ previous_series=${previous_version%.*}
+ mv docs_temp docs/${previous_series}
fi
git add docs
git commit -m "[Website] Update documentations for ${version}"
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index bdb666fd658..14d48b1a615 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -141,6 +141,7 @@ go/arrow/unionmode_string.go
go/arrow/compute/go.sum
go/arrow/compute/datumkind_string.go
go/arrow/compute/funckind_string.go
+go/arrow/compute/internal/kernels/_lib/vendored/*
go/*.tmpldata
go/*.s
go/parquet/internal/gen-go/parquet/GoUnusedProtection__.go
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index a9681af5287..902e4ec7134 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -213,6 +213,7 @@ test_apt() {
;;
esac
if ! docker run --rm -v "${ARROW_DIR}":/arrow:delegated \
+ --security-opt="seccomp=unconfined" \
"${target}" \
/arrow/dev/release/verify-apt.sh \
"${VERSION}" \
@@ -1067,11 +1068,12 @@ test_macos_wheels() {
# the interpreter should be installed from python.org:
# https://www.python.org/ftp/python/3.9.6/python-3.9.6-macosx10.9.pkg
if [ "$(uname -m)" = "arm64" ]; then
- for pyver in "3.9 3.10"; do
+ for pyver in 3.9 3.10; do
local python="/Library/Frameworks/Python.framework/Versions/${pyver}/bin/python${pyver}"
# create and activate a virtualenv for testing as arm64
for arch in "arm64" "x86_64"; do
+ show_header "Testing Python ${pyver} universal2 wheel on ${arch}"
VENV_ENV=wheel-${pyver}-universal2-${arch} PYTHON=${python} maybe_setup_virtualenv || continue
# install pyarrow's universal2 wheel
pip install pyarrow-${VERSION}-cp${pyver/.}-cp${pyver/.}-macosx_11_0_universal2.whl
diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat b/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat
index 02de305eaaa..21e2ae714ed 100644
--- a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat
+++ b/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat
@@ -31,7 +31,7 @@ cmake -G "Ninja" ^
-DARROW_HDFS:BOOL=ON ^
-DARROW_JSON:BOOL=ON ^
-DARROW_MIMALLOC:BOOL=ON ^
- -DARROW_ORC:BOOL=ON ^
+ -DARROW_ORC:BOOL=OFF ^
-DARROW_PACKAGE_PREFIX="%LIBRARY_PREFIX%" ^
-DARROW_PARQUET:BOOL=ON ^
-DARROW_S3:BOOL=ON ^
diff --git a/dev/tasks/linux-packages/github.linux.amd64.yml b/dev/tasks/linux-packages/github.linux.amd64.yml
index f252a081d67..d6488d5e714 100644
--- a/dev/tasks/linux-packages/github.linux.amd64.yml
+++ b/dev/tasks/linux-packages/github.linux.amd64.yml
@@ -44,7 +44,6 @@ jobs:
rake version:update
rake docker:pull || :
rake --trace {{ task_namespace }}:build BUILD_DIR=build
- sudo rm -rf */*/build
popd
env:
APT_TARGETS: {{ target }}
@@ -103,5 +102,5 @@ jobs:
ARROW_VERSION: {{ arrow.version }}
YUM_TARGETS: {{ target }}
- {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/**/*{}") %}
+ {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/*/*/repositories/**/*{}") %}
{{ macros.github_upload_releases(patterns)|indent }}
diff --git a/dev/tasks/linux-packages/travis.linux.arm64.yml b/dev/tasks/linux-packages/travis.linux.arm64.yml
index bc2311a33d6..f3ec4f1de2b 100644
--- a/dev/tasks/linux-packages/travis.linux.arm64.yml
+++ b/dev/tasks/linux-packages/travis.linux.arm64.yml
@@ -160,5 +160,5 @@ script:
- popd
after_success:
- {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/**/*{}") %}
+ {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/*/*/repositories/**/*{}") %}
{{ macros.travis_upload_releases(patterns) }}
diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja
index 3bec472bcf6..bd3358e0733 100644
--- a/dev/tasks/macros.jinja
+++ b/dev/tasks/macros.jinja
@@ -269,7 +269,7 @@ on:
rm -f apache-arrow*.rb.bak
{% endmacro %}
-{%- macro github_change_r_pkg_version(is_fork, version = '\\2.\'\"$(date +%Y%m%d)\"\'' ) -%}
+{%- macro github_change_r_pkg_version(is_fork, version) -%}
- name: Modify version
shell: bash
run: |
@@ -339,12 +339,16 @@ on:
# tree not available in git-bash on windows
run: |
ls -R repo
- - name: Add dev repo to .Rprofile
+ - name: Add repos to .Rprofile
shell: Rscript {0}
run: |
- str <- paste0("options(arrow.dev_repo ='file://", getwd(), "/repo' )")
- print(str)
profile_path <- file.path(getwd(), ".Rprofile")
+ repo <- paste0("file://", getwd(), "/repo")
+ str <- paste0("options(arrow.repo = '", repo, "' )")
+ print(str)
+ write(str, file = profile_path, append = TRUE)
+ str <- paste0("options(arrow.dev_repo = '", repo, "' )")
+ print(str)
write(str, file = profile_path, append = TRUE)
# Set envvar for later steps by appending to $GITHUB_ENV
write(paste0("R_PROFILE_USER=", profile_path), file = Sys.getenv("GITHUB_ENV"), append = TRUE)
diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml
index 222dbab3a08..e53579f7416 100644
--- a/dev/tasks/r/github.packages.yml
+++ b/dev/tasks/r/github.packages.yml
@@ -17,10 +17,6 @@
{% import 'macros.jinja' as macros with context %}
-# This allows us to set a custom version via param:
-# crossbow submit --param custom_version=8.5.3 r-binary-packages
-# if the param is unset defaults to the usual Ymd naming scheme
-{% set package_version = custom_version|replace("Unset", "\\2.\'\"$(date +%Y%m%d)\"\'") %}
{% set is_fork = macros.is_fork %}
{{ macros.github_header() }}
@@ -35,7 +31,7 @@ jobs:
pkg_version: {{ '${{ steps.save-version.outputs.pkg_version }}' }}
steps:
{{ macros.github_checkout_arrow()|indent }}
- {{ macros.github_change_r_pkg_version(is_fork, package_version)|indent }}
+ {{ macros.github_change_r_pkg_version(is_fork, arrow.no_rc_r_version)|indent }}
- name: Save Version
id: save-version
shell: bash
@@ -163,7 +159,7 @@ jobs:
rig default {{ '${{ matrix.r_version.r }}' }}$rig_arch
rig system setup-user-lib
- rig system add-pak
+ rig system add-pak
{{ macros.github_setup_local_r_repo(false, true)|indent }}
- name: Prepare Dependency Installation
@@ -275,18 +271,13 @@ jobs:
ARROW_R_DEV: "TRUE"
LIBARROW_BUILD: "FALSE"
LIBARROW_BINARY: {{ '${{ matrix.config.libarrow_binary }}' }}
- DEVTOOLSET_VERSION: {{ '${{ matrix.config.devtoolset }}' }}
shell: bash
run: |
- if [[ "$DEVTOOLSET_VERSION" -gt 0 ]]; then
- # enable the devtoolset version to use it
- source /opt/rh/devtoolset-$DEVTOOLSET_VERSION/enable
- fi
Rscript -e '
{{ macros.github_test_r_src_pkg()|indent(8) }}
'
- name: Upload binary artifact
- if: matrix.config.devtoolset
+ if: matrix.config.devtoolset
uses: actions/upload-artifact@v3
with:
name: r-pkg_centos7
@@ -307,11 +298,11 @@ jobs:
pkg <- pkg[[1]]
warning("Multiple packages found! Using first one.")
}
-
+
# Install dependencies from RSPM
install.packages("arrow", repos = "https://packagemanager.rstudio.com/all/__linux__/centos7/latest")
remove.packages("arrow")
-
+
install.packages(pkg)
library(arrow)
read_parquet(system.file("v0.7.1.parquet", package = "arrow"))
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 81a2c24bba1..bdf53ff1dac 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -957,17 +957,17 @@ tasks:
params:
custom_version: Unset
artifacts:
- - r-lib__libarrow__bin__windows__arrow-[0-9\.]+\.zip
- - r-lib__libarrow__bin__centos-7__arrow-[0-9\.]+\.zip
- - r-lib__libarrow__bin__ubuntu-18.04__arrow-[0-9\.]+\.zip
- - r-lib__libarrow__bin__ubuntu-22.04__arrow-[0-9\.]+\.zip
- - r-pkg__bin__windows__contrib__4.1__arrow_[0-9\.]+\.zip
- - r-pkg__bin__windows__contrib__4.2__arrow_[0-9\.]+\.zip
- - r-pkg__bin__macosx__contrib__4.1__arrow_[0-9\.]+\.tgz
- - r-pkg__bin__macosx__contrib__4.2__arrow_[0-9\.]+\.tgz
- - r-pkg__bin__macosx__big-sur-arm64__contrib__4.1__arrow_[0-9\.]+\.tgz
- - r-pkg__bin__macosx__big-sur-arm64__contrib__4.2__arrow_[0-9\.]+\.tgz
- - r-pkg__src__contrib__arrow_[0-9\.]+\.tar\.gz
+ - r-lib__libarrow__bin__windows__arrow-{no_rc_r_version}\.zip
+ - r-lib__libarrow__bin__centos-7__arrow-{no_rc_r_version}\.zip
+ - r-lib__libarrow__bin__ubuntu-18.04__arrow-{no_rc_r_version}\.zip
+ - r-lib__libarrow__bin__ubuntu-22.04__arrow-{no_rc_r_version}\.zip
+ - r-pkg__bin__windows__contrib__4.1__arrow_{no_rc_r_version}\.zip
+ - r-pkg__bin__windows__contrib__4.2__arrow_{no_rc_r_version}\.zip
+ - r-pkg__bin__macosx__contrib__4.1__arrow_{no_rc_r_version}\.tgz
+ - r-pkg__bin__macosx__contrib__4.2__arrow_{no_rc_r_version}\.tgz
+ - r-pkg__bin__macosx__big-sur-arm64__contrib__4.1__arrow_{no_rc_r_version}\.tgz
+ - r-pkg__bin__macosx__big-sur-arm64__contrib__4.2__arrow_{no_rc_r_version}\.tgz
+ - r-pkg__src__contrib__arrow_{no_rc_r_version}\.tar\.gz
########################### Release verification ############################
diff --git a/docker-compose.yml b/docker-compose.yml
index 1c3813757fa..86e4c9fd61b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1068,7 +1068,7 @@ services:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
- archery numpydoc --allow-rule PR01,PR10 &&
+ archery numpydoc --allow-rule PR01,PR03,PR10 &&
/arrow/ci/scripts/python_test.sh /arrow"]
conda-python-dask:
diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst
index fc48b2d65ec..74737cb7496 100644
--- a/docs/source/developers/python.rst
+++ b/docs/source/developers/python.rst
@@ -198,7 +198,7 @@ dependencies for Arrow C++ and PyArrow as pre-built binaries, which can make
Arrow development easier and faster.
Let's create a conda environment with all the C++ build and Python dependencies
-from conda-forge, targeting development for Python 3.9:
+from conda-forge, targeting development for Python 3.10:
On Linux and macOS:
@@ -210,7 +210,7 @@ On Linux and macOS:
--file arrow/ci/conda_env_python.txt \
--file arrow/ci/conda_env_gandiva.txt \
compilers \
- python=3.9 \
+ python=3.10 \
pandas
As of January 2019, the ``compilers`` package is needed on many Linux
@@ -495,23 +495,20 @@ First, starting from a fresh clone of Apache Arrow:
--file arrow\ci\conda_env_cpp.txt ^
--file arrow\ci\conda_env_python.txt ^
--file arrow\ci\conda_env_gandiva.txt ^
- python=3.9
+ python=3.10
$ conda activate pyarrow-dev
Now, we build and install Arrow C++ libraries.
-We set a number of environment variables:
-
-- the path of the installation directory of the Arrow C++ libraries as
- ``ARROW_HOME``
-- add the path of installed DLL libraries to ``PATH``
-- and the CMake generator to be used as ``PYARROW_CMAKE_GENERATOR``
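+We set the path of the installation directory of the Arrow C++ libraries as
+``ARROW_HOME``. When using a conda environment, Arrow C++ is installed
+in the environment directory, whose path is saved in the
+`CONDA_PREFIX <https://docs.conda.io/projects/conda-build/en/latest/user-guide/environment-variables.html#environment-variables-that-affect-the-build-process>`_
+environment variable.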
.. code-block::
- $ set ARROW_HOME=%cd%\arrow-dist
- $ set PATH=%ARROW_HOME%\bin;%PATH%
- $ set PYARROW_CMAKE_GENERATOR=Visual Studio 15 2017 Win64
+ $ set ARROW_HOME=%CONDA_PREFIX%\Library
Let's configure, build and install the Arrow C++ libraries:
@@ -519,7 +516,7 @@ Let's configure, build and install the Arrow C++ libraries:
$ mkdir arrow\cpp\build
$ pushd arrow\cpp\build
- $ cmake -G "%PYARROW_CMAKE_GENERATOR%" ^
+ $ cmake -G "Ninja" ^
-DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
-DCMAKE_UNITY_BUILD=ON ^
-DARROW_COMPUTE=ON ^
@@ -535,7 +532,7 @@ Let's configure, build and install the Arrow C++ libraries:
-DARROW_WITH_ZLIB=ON ^
-DARROW_WITH_ZSTD=ON ^
..
- $ cmake --build . --target INSTALL --config Release
+ $ cmake --build . --target install --config Release
$ popd
Now, we can build pyarrow:
@@ -572,10 +569,6 @@ Then run the unit tests with:
the Python extension. This is recommended for development as it allows the
C++ libraries to be re-built separately.
- As a consequence however, ``python setup.py install`` will also not install
- the Arrow C++ libraries. Therefore, to use ``pyarrow`` in python, ``PATH``
- must contain the directory with the Arrow .dll-files.
-
If you want to bundle the Arrow C++ libraries with ``pyarrow``, add
the ``--bundle-arrow-cpp`` option when building:
@@ -586,56 +579,10 @@ Then run the unit tests with:
Important: If you combine ``--bundle-arrow-cpp`` with ``--inplace`` the
Arrow C++ libraries get copied to the source tree and are not cleared
by ``python setup.py clean``. They remain in place and will take precedence
- over any later Arrow C++ libraries contained in ``PATH``. This can lead to
+ over any later Arrow C++ libraries contained in ``CONDA_PREFIX``. This can lead to
incompatibilities when ``pyarrow`` is later built without
``--bundle-arrow-cpp``.
-Running C++ unit tests for Python integration
----------------------------------------------
-
-Running C++ unit tests should not be necessary for most developers. If you do
-want to run them, you need to pass ``-DARROW_BUILD_TESTS=ON`` during
-configuration of the Arrow C++ library build:
-
-.. code-block::
-
- $ mkdir arrow\cpp\build
- $ pushd arrow\cpp\build
- $ cmake -G "%PYARROW_CMAKE_GENERATOR%" ^
- -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
- -DARROW_BUILD_TESTS=ON ^
- -DARROW_COMPUTE=ON ^
- -DARROW_CSV=ON ^
- -DARROW_CXXFLAGS="/WX /MP" ^
- -DARROW_DATASET=ON ^
- -DARROW_FILESYSTEM=ON ^
- -DARROW_HDFS=ON ^
- -DARROW_JSON=ON ^
- -DARROW_PARQUET=ON ^
- ..
- $ cmake --build . --target INSTALL --config Release
- $ popd
-
-Getting ``arrow-python-test.exe`` (C++ unit tests for python integration) to
-run is a bit tricky because your ``%PYTHONHOME%`` must be configured to point
-to the active conda environment:
-
-.. code-block::
-
- $ set PYTHONHOME=%CONDA_PREFIX%
- $ pushd arrow\cpp\build\release\Release
- $ arrow-python-test.exe
- $ popd
-
-To run all tests of the Arrow C++ library, you can also run ``ctest``:
-
-.. code-block::
-
- $ set PYTHONHOME=%CONDA_PREFIX%
- $ pushd arrow\cpp\build
- $ ctest
- $ popd
-
Caveats
-------
diff --git a/docs/source/java/flight_sql.rst b/docs/source/java/flight_sql.rst
new file mode 100644
index 00000000000..dbf97238d4c
--- /dev/null
+++ b/docs/source/java/flight_sql.rst
@@ -0,0 +1,32 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+================
+Arrow Flight SQL
+================
+
+Arrow Flight SQL is a protocol for interacting with SQL databases using the
+Arrow in-memory format and the Flight RPC framework.
+
+.. seealso::
+
+ :doc:`Flight SQL protocol documentation <../format/FlightSql>`
+ Documentation of the Flight SQL protocol.
+
+For usage information, see the `API documentation`_.
+
+.. _API documentation: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/sql/package-summary.html
diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst
new file mode 100644
index 00000000000..65b1a7162f4
--- /dev/null
+++ b/docs/source/java/flight_sql_jdbc_driver.rst
@@ -0,0 +1,128 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+============================
+Arrow Flight SQL JDBC Driver
+============================
+
+The Flight SQL JDBC driver is a JDBC driver implementation that uses
+the :doc:`Flight SQL protocol <../format/FlightSql>` under the hood.
+This driver can be used with any database that implements Flight SQL.
+
+.. contents::
+
+Installation and Requirements
+=============================
+
+The driver is compatible with JDK 8+. On JDK 9+, the following JVM
+parameter is required:
+
+.. code-block:: shell
+
+ java --add-opens=java.base/java.nio=ALL-UNNAMED ...
+
+To add a dependency via Maven, use a ``pom.xml`` like the following:
+
+.. code-block:: xml
+
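+   <?xml version="1.0" encoding="UTF-8"?>
+   <project xmlns="http://maven.apache.org/POM/4.0.0"
+            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+            xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+     <modelVersion>4.0.0</modelVersion>
+     <groupId>org.example</groupId>
+     <artifactId>demo</artifactId>
+     <version>1.0-SNAPSHOT</version>
+     <properties>
+       <arrow.version>10.0.0</arrow.version>
+     </properties>
+     <dependencies>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>flight-sql-jdbc-driver</artifactId>
+         <version>${arrow.version}</version>
+       </dependency>
+     </dependencies>
+   </project>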
+
+Connecting to a Database
+========================
+
+The URI format is as follows::
+
+ jdbc:arrow-flight-sql://HOSTNAME:PORT[/?param1=val1&param2=val2&...]
+
+For example, take this URI::
+
+ jdbc:arrow-flight-sql://localhost:12345/?username=admin&password=pass&useEncryption=1
+
+This will connect to a Flight SQL service running on ``localhost`` on
+port 12345. It will create a secure, encrypted connection, and
+authenticate using the username ``admin`` and the password ``pass``.
+
+The components of the URI are as follows.
+
+* The URI scheme must be ``jdbc:arrow-flight-sql://``.
+* **HOSTNAME** is the hostname of the Flight SQL service.
+* **PORT** is the port of the Flight SQL service.
+
+Additional options can be passed as query parameters. The supported
+parameters are:
+
+.. list-table::
+ :header-rows: 1
+
+ * - Parameter
+ - Default
+ - Description
+
+ * - disableCertificateVerification
+ - false
+ - When TLS is enabled, whether to verify the server certificate
+
+ * - password
+ - null
+ - The password for user/password authentication
+
+ * - threadPoolSize
+ - 1
+ - The size of an internal thread pool
+
+ * - token
+ - null
+ - The token used for token authentication
+
+ * - trustStore
+ - null
+ - When TLS is enabled, the path to the certificate store
+
+ * - trustStorePassword
+ - null
+ - When TLS is enabled, the password for the certificate store
+
+ * - useEncryption
+ - false
+ - Whether to use TLS (the default is an insecure, plaintext
+ connection)
+
+ * - username
+ - null
+ - The username for user/password authentication
+
+ * - useSystemTrustStore
+ - true
+ - When TLS is enabled, whether to use the system certificate store
diff --git a/docs/source/java/index.rst b/docs/source/java/index.rst
index 3c9bde6ba53..a1e924f9c09 100644
--- a/docs/source/java/index.rst
+++ b/docs/source/java/index.rst
@@ -34,6 +34,8 @@ on the Arrow format and other language bindings see the :doc:`parent documentati
ipc
algorithm
flight
+ flight_sql
+ flight_sql_jdbc_driver
dataset
cdata
jdbc
diff --git a/go/arrow/bitutil/_lib/bitmap_ops.c b/go/arrow/bitutil/_lib/bitmap_ops.c
index 96817b2f2b5..f48b4d4d821 100644
--- a/go/arrow/bitutil/_lib/bitmap_ops.c
+++ b/go/arrow/bitutil/_lib/bitmap_ops.c
@@ -31,4 +31,16 @@ void FULL_NAME(bitmap_aligned_or)(const uint8_t* left, const uint8_t* right, uin
for (int64_t i = 0; i < nbytes; ++i) {
out[i] = left[i] | right[i];
}
-}
\ No newline at end of file
+}
+
+void FULL_NAME(bitmap_aligned_and_not)(const uint8_t* left, const uint8_t* right, uint8_t* out, const int64_t nbytes) {
+ for (int64_t i = 0; i < nbytes; ++i) {  // byte-wise; no iterations when nbytes <= 0
+ out[i] = left[i] & ~right[i];  // keep the bits of left that are clear in right
+ }
+}
+
+void FULL_NAME(bitmap_aligned_xor)(const uint8_t* left, const uint8_t* right, uint8_t* out, const int64_t nbytes) {
+ for (int64_t i = 0; i < nbytes; ++i) {  // byte-wise; no iterations when nbytes <= 0
+ out[i] = left[i] ^ right[i];  // bits that differ between left and right
+ }
+}
diff --git a/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s b/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
index 69f69d29708..a4010dab55b 100644
--- a/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
+++ b/go/arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
@@ -207,6 +207,204 @@ bitmap_aligned_or_avx2: # @bitmap_aligned_or_avx2
.Lfunc_end1:
.size bitmap_aligned_or_avx2, .Lfunc_end1-bitmap_aligned_or_avx2
# -- End function
+ .globl bitmap_aligned_and_not_avx2 # -- Begin function bitmap_aligned_and_not_avx2
+ .p2align 4, 0x90
+ .type bitmap_aligned_and_not_avx2,@function
+bitmap_aligned_and_not_avx2: # @bitmap_aligned_and_not_avx2
+# %bb.0:
+ push rbp # args as used below: rdi=left, rsi=right, rdx=out, rcx=nbytes
+ mov rbp, rsp
+ push rbx # bl is used as a scratch flag in the overlap checks
+ and rsp, -8
+ test rcx, rcx
+ jle .LBB2_12 # nbytes <= 0: nothing to do
+# %bb.1:
+ cmp rcx, 127
+ ja .LBB2_7 # at least 128 bytes: consider the vector loop
+# %bb.2:
+ xor r8d, r8d # r8 = index of the first unprocessed byte (0)
+ jmp .LBB2_3
+.LBB2_7: # overlap checks: does out alias left or right?
+ lea r8, [rdx + rcx] # r8 = out + nbytes
+ lea rax, [rdi + rcx]
+ cmp rax, rdx
+ seta r11b # left + nbytes > out
+ lea rax, [rsi + rcx]
+ cmp r8, rdi
+ seta bl # out + nbytes > left
+ cmp rax, rdx
+ seta r10b # right + nbytes > out
+ cmp r8, rsi
+ seta r9b # out + nbytes > right
+ xor r8d, r8d
+ test r11b, bl
+ jne .LBB2_3 # out overlaps left: take the scalar path
+# %bb.8:
+ and r10b, r9b
+ jne .LBB2_3 # out overlaps right: take the scalar path
+# %bb.9:
+ mov r8, rcx
+ and r8, -128 # r8 = nbytes rounded down to a multiple of 128
+ xor eax, eax
+ .p2align 4, 0x90
+.LBB2_10: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rsi + rax] # load 4 x 32 bytes of right
+ vmovups ymm1, ymmword ptr [rsi + rax + 32]
+ vmovups ymm2, ymmword ptr [rsi + rax + 64]
+ vmovups ymm3, ymmword ptr [rsi + rax + 96]
+ vandnps ymm0, ymm0, ymmword ptr [rdi + rax] # (NOT right) AND left
+ vandnps ymm1, ymm1, ymmword ptr [rdi + rax + 32]
+ vandnps ymm2, ymm2, ymmword ptr [rdi + rax + 64]
+ vandnps ymm3, ymm3, ymmword ptr [rdi + rax + 96]
+ vmovups ymmword ptr [rdx + rax], ymm0 # store 4 x 32 bytes to out
+ vmovups ymmword ptr [rdx + rax + 32], ymm1
+ vmovups ymmword ptr [rdx + rax + 64], ymm2
+ vmovups ymmword ptr [rdx + rax + 96], ymm3
+ sub rax, -128 # rax += 128
+ cmp r8, rax
+ jne .LBB2_10
+# %bb.11:
+ cmp r8, rcx
+ je .LBB2_12 # nbytes was a multiple of 128: no tail
+.LBB2_3: # scalar path, starting at byte index r8
+ mov r9, r8
+ not r9
+ test cl, 1
+ je .LBB2_5 # even nbytes: skip the single-byte step
+# %bb.4:
+ mov al, byte ptr [rsi + r8]
+ not al
+ and al, byte ptr [rdi + r8]
+ mov byte ptr [rdx + r8], al
+ or r8, 1
+.LBB2_5:
+ add r9, rcx # r9 = nbytes - 1 - (r8 on entry to .LBB2_3)
+ je .LBB2_12 # only one byte remained: done
+ .p2align 4, 0x90
+.LBB2_6: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rsi + r8] # two bytes per iteration
+ not al
+ and al, byte ptr [rdi + r8]
+ mov byte ptr [rdx + r8], al
+ movzx eax, byte ptr [rsi + r8 + 1]
+ not al
+ and al, byte ptr [rdi + r8 + 1]
+ mov byte ptr [rdx + r8 + 1], al
+ add r8, 2
+ cmp rcx, r8
+ jne .LBB2_6
+.LBB2_12: # epilogue
+ lea rsp, [rbp - 8]
+ pop rbx
+ pop rbp
+ vzeroupper
+ ret
+.Lfunc_end2:
+ .size bitmap_aligned_and_not_avx2, .Lfunc_end2-bitmap_aligned_and_not_avx2
+ # -- End function
+ .globl bitmap_aligned_xor_avx2 # -- Begin function bitmap_aligned_xor_avx2
+ .p2align 4, 0x90
+ .type bitmap_aligned_xor_avx2,@function
+bitmap_aligned_xor_avx2: # @bitmap_aligned_xor_avx2
+# %bb.0:
+ push rbp # args as used below: rdi=left, rsi=right, rdx=out, rcx=nbytes
+ mov rbp, rsp
+ push rbx # bl is used as a scratch flag in the overlap checks
+ and rsp, -8
+ test rcx, rcx
+ jle .LBB3_12 # nbytes <= 0: nothing to do
+# %bb.1:
+ cmp rcx, 127
+ ja .LBB3_7 # at least 128 bytes: consider the vector loop
+# %bb.2:
+ xor r10d, r10d # r10 = index of the first unprocessed byte (0)
+ jmp .LBB3_3
+.LBB3_7: # overlap checks: does out alias left or right?
+ lea r9, [rdx + rcx] # r9 = out + nbytes
+ lea rax, [rdi + rcx]
+ cmp rax, rdx
+ seta r11b # left + nbytes > out
+ lea rax, [rsi + rcx]
+ cmp r9, rdi
+ seta bl # out + nbytes > left
+ cmp rax, rdx
+ seta r8b # right + nbytes > out
+ cmp r9, rsi
+ seta r9b # out + nbytes > right
+ xor r10d, r10d
+ test r11b, bl
+ jne .LBB3_3 # out overlaps left: take the scalar path
+# %bb.8:
+ and r8b, r9b
+ jne .LBB3_3 # out overlaps right: take the scalar path
+# %bb.9:
+ mov r10, rcx
+ and r10, -128 # r10 = nbytes rounded down to a multiple of 128
+ xor r8d, r8d
+ .p2align 4, 0x90
+.LBB3_10: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rsi + r8] # load 4 x 32 bytes of right
+ vmovups ymm1, ymmword ptr [rsi + r8 + 32]
+ vmovups ymm2, ymmword ptr [rsi + r8 + 64]
+ vmovups ymm3, ymmword ptr [rsi + r8 + 96]
+ vxorps ymm0, ymm0, ymmword ptr [rdi + r8] # right XOR left
+ vxorps ymm1, ymm1, ymmword ptr [rdi + r8 + 32]
+ vxorps ymm2, ymm2, ymmword ptr [rdi + r8 + 64]
+ vxorps ymm3, ymm3, ymmword ptr [rdi + r8 + 96]
+ vmovups ymmword ptr [rdx + r8], ymm0 # store 4 x 32 bytes to out
+ vmovups ymmword ptr [rdx + r8 + 32], ymm1
+ vmovups ymmword ptr [rdx + r8 + 64], ymm2
+ vmovups ymmword ptr [rdx + r8 + 96], ymm3
+ sub r8, -128 # r8 += 128
+ cmp r10, r8
+ jne .LBB3_10
+# %bb.11:
+ cmp r10, rcx
+ je .LBB3_12 # nbytes was a multiple of 128: no tail
+.LBB3_3: # scalar path, starting at byte index r10
+ mov r8, r10
+ not r8
+ add r8, rcx # r8 = nbytes - 1 - r10
+ mov r9, rcx
+ and r9, 3 # r9 = nbytes mod 4
+ je .LBB3_5 # nothing to peel before the 4-byte loop
+ .p2align 4, 0x90
+.LBB3_4: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rsi + r10] # one byte per iteration, r9 times
+ xor al, byte ptr [rdi + r10]
+ mov byte ptr [rdx + r10], al
+ add r10, 1
+ add r9, -1
+ jne .LBB3_4
+.LBB3_5:
+ cmp r8, 3
+ jb .LBB3_12 # fewer than 4 bytes were left on entry to .LBB3_3: done
+ .p2align 4, 0x90
+.LBB3_6: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rsi + r10] # four bytes per iteration
+ xor al, byte ptr [rdi + r10]
+ mov byte ptr [rdx + r10], al
+ movzx eax, byte ptr [rsi + r10 + 1]
+ xor al, byte ptr [rdi + r10 + 1]
+ mov byte ptr [rdx + r10 + 1], al
+ movzx eax, byte ptr [rsi + r10 + 2]
+ xor al, byte ptr [rdi + r10 + 2]
+ mov byte ptr [rdx + r10 + 2], al
+ movzx eax, byte ptr [rsi + r10 + 3]
+ xor al, byte ptr [rdi + r10 + 3]
+ mov byte ptr [rdx + r10 + 3], al
+ add r10, 4
+ cmp rcx, r10
+ jne .LBB3_6
+.LBB3_12: # epilogue
+ lea rsp, [rbp - 8]
+ pop rbx
+ pop rbp
+ vzeroupper
+ ret
+.Lfunc_end3:
+ .size bitmap_aligned_xor_avx2, .Lfunc_end3-bitmap_aligned_xor_avx2
+ # -- End function
.ident "Ubuntu clang version 11.1.0-6"
.section ".note.GNU-stack","",@progbits
.addrsig
diff --git a/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s b/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s
index 9d028155b72..840c1a623bb 100644
--- a/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s
+++ b/go/arrow/bitutil/_lib/bitmap_ops_sse4_amd64.s
@@ -267,6 +267,264 @@ bitmap_aligned_or_sse4: # @bitmap_aligned_or_sse4
.Lfunc_end1:
.size bitmap_aligned_or_sse4, .Lfunc_end1-bitmap_aligned_or_sse4
# -- End function
+# bitmap_aligned_and_not_sse4: byte-wise AND-NOT of two buffers.
+# For every i in [0, rcx): byte [rdx + i] = byte [rdi + i] & ~byte [rsi + i].
+# Register roles as used below: rdi = first operand, rsi = operand that is
+# complemented, rdx = destination, rcx = byte count (a count <= 0 is a no-op).
+ .globl bitmap_aligned_and_not_sse4 # -- Begin function bitmap_aligned_and_not_sse4
+ .p2align 4, 0x90
+ .type bitmap_aligned_and_not_sse4,@function
+bitmap_aligned_and_not_sse4: # @bitmap_aligned_and_not_sse4
+# %bb.0:
+# Prologue (rbx is saved because bl is used in the overlap check), then
+# leave immediately when the byte count is zero or negative.
+ push rbp
+ mov rbp, rsp
+ push rbx
+ and rsp, -8
+ test rcx, rcx
+ jle .LBB2_16
+# %bb.1:
+# Counts of 32 bytes or more try the vector path; smaller counts go
+# straight to the scalar code with the byte index r11 = 0.
+ cmp rcx, 31
+ ja .LBB2_7
+# %bb.2:
+ xor r11d, r11d
+# Scalar path, starting at byte index r11. If the count is odd, one byte is
+# handled here so the loop at .LBB2_6 can work two bytes at a time.
+.LBB2_3:
+ mov r8, r11
+ not r8
+ test cl, 1
+ je .LBB2_5
+# %bb.4:
+ mov al, byte ptr [rsi + r11]
+ not al
+ and al, byte ptr [rdi + r11]
+ mov byte ptr [rdx + r11], al
+ or r11, 1
+# r8 = ~start + rcx = (bytes remaining at entry to .LBB2_3) - 1; zero means
+# the single byte handled above was the last one.
+.LBB2_5:
+ add r8, rcx
+ je .LBB2_16
+ .p2align 4, 0x90
+# Scalar loop: two bytes per iteration until r11 reaches rcx.
+.LBB2_6: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rsi + r11]
+ not al
+ and al, byte ptr [rdi + r11]
+ mov byte ptr [rdx + r11], al
+ movzx eax, byte ptr [rsi + r11 + 1]
+ not al
+ and al, byte ptr [rdi + r11 + 1]
+ mov byte ptr [rdx + r11 + 1], al
+ add r11, 2
+ cmp rcx, r11
+ jne .LBB2_6
+ jmp .LBB2_16
+# Overlap check before vectorising:
+#   r10b = (rdi + rcx > rdx), bl  = (rdx + rcx > rdi)  -> destination overlaps [rdi, rdi+rcx)
+#   r8b  = (rsi + rcx > rdx), r9b = (rdx + rcx > rsi)  -> destination overlaps [rsi, rsi+rcx)
+# Either overlap falls back to the scalar path with r11 = 0.
+.LBB2_7:
+ lea r9, [rdx + rcx]
+ lea rax, [rdi + rcx]
+ cmp rax, rdx
+ seta r10b
+ lea rax, [rsi + rcx]
+ cmp r9, rdi
+ seta bl
+ cmp rax, rdx
+ seta r8b
+ cmp r9, rsi
+ seta r9b
+ xor r11d, r11d
+ test r10b, bl
+ jne .LBB2_3
+# %bb.8:
+ and r8b, r9b
+ jne .LBB2_3
+# %bb.9:
+# r11 = rcx rounded down to a multiple of 32 (bytes covered by the vector
+# code); r9 = number of 32-byte chunks. A single chunk skips the unrolled loop.
+ mov r11, rcx
+ and r11, -32
+ lea rax, [r11 - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1
+ test rax, rax
+ je .LBB2_10
+# %bb.11:
+# r10 = -(r9 rounded down to even): counts up to zero, two chunks per pass.
+ mov r10, r9
+ and r10, -2
+ neg r10
+ xor r8d, r8d
+ .p2align 4, 0x90
+# Vector loop: 64 bytes per iteration (two 32-byte chunks, each as two
+# 16-byte andnps operations). andnps dst, src computes dst = ~dst & src, so
+# the rsi data is loaded into the destination register to be complemented.
+.LBB2_12: # =>This Inner Loop Header: Depth=1
+ movups xmm0, xmmword ptr [rdi + r8]
+ movups xmm1, xmmword ptr [rdi + r8 + 16]
+ movups xmm2, xmmword ptr [rsi + r8]
+ andnps xmm2, xmm0
+ movups xmm0, xmmword ptr [rsi + r8 + 16]
+ andnps xmm0, xmm1
+ movups xmmword ptr [rdx + r8], xmm2
+ movups xmmword ptr [rdx + r8 + 16], xmm0
+ movups xmm0, xmmword ptr [rdi + r8 + 32]
+ movups xmm1, xmmword ptr [rdi + r8 + 48]
+ movups xmm2, xmmword ptr [rsi + r8 + 32]
+ andnps xmm2, xmm0
+ movups xmm0, xmmword ptr [rsi + r8 + 48]
+ andnps xmm0, xmm1
+ movups xmmword ptr [rdx + r8 + 32], xmm2
+ movups xmmword ptr [rdx + r8 + 48], xmm0
+ add r8, 64
+ add r10, 2
+ jne .LBB2_12
+# %bb.13:
+# An odd chunk count leaves one 32-byte chunk for .LBB2_14.
+ test r9b, 1
+ je .LBB2_15
+.LBB2_14:
+ movups xmm0, xmmword ptr [rdi + r8]
+ movups xmm1, xmmword ptr [rdi + r8 + 16]
+ movups xmm2, xmmword ptr [rsi + r8]
+ andnps xmm2, xmm0
+ movups xmm0, xmmword ptr [rsi + r8 + 16]
+ andnps xmm0, xmm1
+ movups xmmword ptr [rdx + r8], xmm2
+ movups xmmword ptr [rdx + r8 + 16], xmm0
+# Bytes past the last full 32-byte chunk (if any) go to the scalar path.
+.LBB2_15:
+ cmp r11, rcx
+ jne .LBB2_3
+# Epilogue: restore rbx/rbp and return.
+.LBB2_16:
+ lea rsp, [rbp - 8]
+ pop rbx
+ pop rbp
+ ret
+# Exactly one 32-byte chunk: start at offset 0 and reuse .LBB2_14.
+.LBB2_10:
+ xor r8d, r8d
+ test r9b, 1
+ jne .LBB2_14
+ jmp .LBB2_15
+.Lfunc_end2:
+ .size bitmap_aligned_and_not_sse4, .Lfunc_end2-bitmap_aligned_and_not_sse4
+ # -- End function
+# bitmap_aligned_xor_sse4: byte-wise XOR of two buffers.
+# For every i in [0, rcx): byte [rdx + i] = byte [rdi + i] ^ byte [rsi + i].
+# Register roles as used below: rdi and rsi = inputs, rdx = destination,
+# rcx = byte count (a count <= 0 is a no-op).
+ .globl bitmap_aligned_xor_sse4 # -- Begin function bitmap_aligned_xor_sse4
+ .p2align 4, 0x90
+ .type bitmap_aligned_xor_sse4,@function
+bitmap_aligned_xor_sse4: # @bitmap_aligned_xor_sse4
+# %bb.0:
+# Prologue (rbx is saved because bl is used in the overlap check), then
+# leave immediately when the byte count is zero or negative.
+ push rbp
+ mov rbp, rsp
+ push rbx
+ and rsp, -8
+ test rcx, rcx
+ jle .LBB3_16
+# %bb.1:
+# Counts of 32 bytes or more try the vector path; smaller counts go
+# straight to the scalar code with the byte index r11 = 0.
+ cmp rcx, 31
+ ja .LBB3_7
+# %bb.2:
+ xor r11d, r11d
+# Scalar path, starting at byte index r11.
+#   r8 = ~r11 + rcx = (bytes remaining) - 1
+#   r9 = rcx & 3    = bytes to peel one at a time before the 4-byte loop
+.LBB3_3:
+ mov r8, r11
+ not r8
+ add r8, rcx
+ mov r9, rcx
+ and r9, 3
+ je .LBB3_5
+ .p2align 4, 0x90
+# Peel loop: one byte per iteration, r9 times.
+.LBB3_4: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rsi + r11]
+ xor al, byte ptr [rdi + r11]
+ mov byte ptr [rdx + r11], al
+ add r11, 1
+ add r9, -1
+ jne .LBB3_4
+# Fewer than four bytes were remaining at .LBB3_3: the peel loop did it all.
+.LBB3_5:
+ cmp r8, 3
+ jb .LBB3_16
+ .p2align 4, 0x90
+# Scalar loop: four bytes per iteration until r11 reaches rcx.
+.LBB3_6: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rsi + r11]
+ xor al, byte ptr [rdi + r11]
+ mov byte ptr [rdx + r11], al
+ movzx eax, byte ptr [rsi + r11 + 1]
+ xor al, byte ptr [rdi + r11 + 1]
+ mov byte ptr [rdx + r11 + 1], al
+ movzx eax, byte ptr [rsi + r11 + 2]
+ xor al, byte ptr [rdi + r11 + 2]
+ mov byte ptr [rdx + r11 + 2], al
+ movzx eax, byte ptr [rsi + r11 + 3]
+ xor al, byte ptr [rdi + r11 + 3]
+ mov byte ptr [rdx + r11 + 3], al
+ add r11, 4
+ cmp rcx, r11
+ jne .LBB3_6
+ jmp .LBB3_16
+# Overlap check before vectorising:
+#   r10b = (rdi + rcx > rdx), bl  = (rdx + rcx > rdi)  -> destination overlaps [rdi, rdi+rcx)
+#   r8b  = (rsi + rcx > rdx), r9b = (rdx + rcx > rsi)  -> destination overlaps [rsi, rsi+rcx)
+# Either overlap falls back to the scalar path with r11 = 0.
+.LBB3_7:
+ lea r9, [rdx + rcx]
+ lea rax, [rdi + rcx]
+ cmp rax, rdx
+ seta r10b
+ lea rax, [rsi + rcx]
+ cmp r9, rdi
+ seta bl
+ cmp rax, rdx
+ seta r8b
+ cmp r9, rsi
+ seta r9b
+ xor r11d, r11d
+ test r10b, bl
+ jne .LBB3_3
+# %bb.8:
+ and r8b, r9b
+ jne .LBB3_3
+# %bb.9:
+# r11 = rcx rounded down to a multiple of 32 (bytes covered by the vector
+# code); r9 = number of 32-byte chunks. A single chunk skips the unrolled loop.
+ mov r11, rcx
+ and r11, -32
+ lea rax, [r11 - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1
+ test rax, rax
+ je .LBB3_10
+# %bb.11:
+# r10 = -(r9 rounded down to even): counts up to zero, two chunks per pass.
+ mov r10, r9
+ and r10, -2
+ neg r10
+ xor r8d, r8d
+ .p2align 4, 0x90
+# Vector loop: 64 bytes per iteration (two 32-byte chunks, each as two
+# 16-byte xorps operations).
+.LBB3_12: # =>This Inner Loop Header: Depth=1
+ movups xmm0, xmmword ptr [rdi + r8]
+ movups xmm1, xmmword ptr [rdi + r8 + 16]
+ movups xmm2, xmmword ptr [rsi + r8]
+ xorps xmm2, xmm0
+ movups xmm0, xmmword ptr [rsi + r8 + 16]
+ xorps xmm0, xmm1
+ movups xmmword ptr [rdx + r8], xmm2
+ movups xmmword ptr [rdx + r8 + 16], xmm0
+ movups xmm0, xmmword ptr [rdi + r8 + 32]
+ movups xmm1, xmmword ptr [rdi + r8 + 48]
+ movups xmm2, xmmword ptr [rsi + r8 + 32]
+ xorps xmm2, xmm0
+ movups xmm0, xmmword ptr [rsi + r8 + 48]
+ xorps xmm0, xmm1
+ movups xmmword ptr [rdx + r8 + 32], xmm2
+ movups xmmword ptr [rdx + r8 + 48], xmm0
+ add r8, 64
+ add r10, 2
+ jne .LBB3_12
+# %bb.13:
+# An odd chunk count leaves one 32-byte chunk for .LBB3_14.
+ test r9b, 1
+ je .LBB3_15
+.LBB3_14:
+ movups xmm0, xmmword ptr [rdi + r8]
+ movups xmm1, xmmword ptr [rdi + r8 + 16]
+ movups xmm2, xmmword ptr [rsi + r8]
+ xorps xmm2, xmm0
+ movups xmm0, xmmword ptr [rsi + r8 + 16]
+ xorps xmm0, xmm1
+ movups xmmword ptr [rdx + r8], xmm2
+ movups xmmword ptr [rdx + r8 + 16], xmm0
+# Bytes past the last full 32-byte chunk (if any) go to the scalar path.
+.LBB3_15:
+ cmp r11, rcx
+ jne .LBB3_3
+# Epilogue: restore rbx/rbp and return.
+.LBB3_16:
+ lea rsp, [rbp - 8]
+ pop rbx
+ pop rbp
+ ret
+# Exactly one 32-byte chunk: start at offset 0 and reuse .LBB3_14.
+.LBB3_10:
+ xor r8d, r8d
+ test r9b, 1
+ jne .LBB3_14
+ jmp .LBB3_15
+.Lfunc_end3:
+ .size bitmap_aligned_xor_sse4, .Lfunc_end3-bitmap_aligned_xor_sse4
+ # -- End function
.ident "Ubuntu clang version 11.1.0-6"
.section ".note.GNU-stack","",@progbits
.addrsig
diff --git a/go/arrow/bitutil/bitmap_ops.go b/go/arrow/bitutil/bitmap_ops.go
index 62322b04b9d..7db750a6dd9 100644
--- a/go/arrow/bitutil/bitmap_ops.go
+++ b/go/arrow/bitutil/bitmap_ops.go
@@ -39,6 +39,29 @@ func alignedBitAndGo(left, right, out []byte) {
}
}
+// alignedBitAndNotGo is the portable implementation of the aligned AND-NOT
+// kernel: for each of the len(out) bytes it stores left &^ right into out.
+//
+// When out is longer than uint64SizeBytes the buffers are viewed as uint64
+// words through bytesToUint64 and combined a word at a time; whatever is not
+// covered by those words is then finished byte by byte.
+func alignedBitAndNotGo(left, right, out []byte) {
+	total := len(out)
+	done := 0
+
+	if total > uint64SizeBytes {
+		// word-at-a-time pass over as much of the buffers as forms whole words
+		lw := bytesToUint64(left)
+		rw := bytesToUint64(right)
+		ow := bytesToUint64(out)
+
+		for idx := 0; idx < len(ow); idx++ {
+			ow[idx] = lw[idx] &^ rw[idx]
+		}
+
+		done = len(ow) * uint64SizeBytes
+	}
+
+	// byte-at-a-time pass over the leftover tail (or the whole input when
+	// the word pass was skipped)
+	for pos := done; pos < total; pos++ {
+		out[pos] = left[pos] &^ right[pos]
+	}
+}
+
func alignedBitOrGo(left, right, out []byte) {
var (
nbytes = len(out)
@@ -61,3 +84,26 @@ func alignedBitOrGo(left, right, out []byte) {
out[i] = left[i] | right[i]
}
}
+
+// alignedBitXorGo is the portable implementation of the aligned XOR kernel:
+// for each of the len(out) bytes it stores left ^ right into out.
+//
+// When out is longer than uint64SizeBytes the buffers are viewed as uint64
+// words through bytesToUint64 and combined a word at a time; whatever is not
+// covered by those words is then finished byte by byte.
+func alignedBitXorGo(left, right, out []byte) {
+	total := len(out)
+	done := 0
+
+	if total > uint64SizeBytes {
+		// word-at-a-time pass over as much of the buffers as forms whole words
+		lw := bytesToUint64(left)
+		rw := bytesToUint64(right)
+		ow := bytesToUint64(out)
+
+		for idx := 0; idx < len(ow); idx++ {
+			ow[idx] = lw[idx] ^ rw[idx]
+		}
+
+		done = len(ow) * uint64SizeBytes
+	}
+
+	// byte-at-a-time pass over the leftover tail (or the whole input when
+	// the word pass was skipped)
+	for pos := done; pos < total; pos++ {
+		out[pos] = left[pos] ^ right[pos]
+	}
+}
diff --git a/go/arrow/bitutil/bitmap_ops_amd64.go b/go/arrow/bitutil/bitmap_ops_amd64.go
index 9aa5a6dd56b..ad0fd674ab9 100644
--- a/go/arrow/bitutil/bitmap_ops_amd64.go
+++ b/go/arrow/bitutil/bitmap_ops_amd64.go
@@ -25,11 +25,17 @@ func init() {
if cpu.X86.HasAVX2 {
bitAndOp.opAligned = bitmapAlignedAndAVX2
bitOrOp.opAligned = bitmapAlignedOrAVX2
+ bitAndNotOp.opAligned = bitmapAlignedAndNotAVX2
+ bitXorOp.opAligned = bitmapAlignedXorAVX2
} else if cpu.X86.HasSSE42 {
bitAndOp.opAligned = bitmapAlignedAndSSE4
bitOrOp.opAligned = bitmapAlignedOrSSE4
+ bitAndNotOp.opAligned = bitmapAlignedAndNotSSE4
+ bitXorOp.opAligned = bitmapAlignedXorSSE4
} else {
bitAndOp.opAligned = alignedBitAndGo
bitOrOp.opAligned = alignedBitOrGo
+ bitAndNotOp.opAligned = alignedBitAndNotGo
+ bitXorOp.opAligned = alignedBitXorGo
}
}
diff --git a/go/arrow/bitutil/bitmap_ops_arm64.go b/go/arrow/bitutil/bitmap_ops_arm64.go
index 86c47639a9e..28d95d84ade 100644
--- a/go/arrow/bitutil/bitmap_ops_arm64.go
+++ b/go/arrow/bitutil/bitmap_ops_arm64.go
@@ -22,4 +22,6 @@ package bitutil
+// init points the aligned kernel of every bitmap operation (and, or,
+// and-not, xor) at its pure-Go implementation.
func init() {
bitAndOp.opAligned = alignedBitAndGo
bitOrOp.opAligned = alignedBitOrGo
+ bitAndNotOp.opAligned = alignedBitAndNotGo
+ bitXorOp.opAligned = alignedBitXorGo
}
diff --git a/go/arrow/bitutil/bitmap_ops_avx2_amd64.go b/go/arrow/bitutil/bitmap_ops_avx2_amd64.go
index 731b9807b79..1c01bd0f380 100644
--- a/go/arrow/bitutil/bitmap_ops_avx2_amd64.go
+++ b/go/arrow/bitutil/bitmap_ops_avx2_amd64.go
@@ -36,3 +36,17 @@ func _bitmap_aligned_or_avx2(left, right, out unsafe.Pointer, length int64)
func bitmapAlignedOrAVX2(left, right, out []byte) {
_bitmap_aligned_or_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
}
+
+// _bitmap_aligned_and_not_avx2 is implemented in bitmap_ops_avx2_amd64.s.
+// It writes left[i] &^ right[i] to out[i] for each of the length bytes.
+//
+//go:noescape
+func _bitmap_aligned_and_not_avx2(left, right, out unsafe.Pointer, length int64)
+
+// bitmapAlignedAndNotAVX2 computes out[i] = left[i] &^ right[i] for every
+// byte of out using the AVX2 assembly kernel.
+//
+// left and right must each hold at least len(out) bytes; the kernel reads
+// len(out) bytes from both without bounds checks.
+func bitmapAlignedAndNotAVX2(left, right, out []byte) {
+	if len(out) == 0 {
+		// Nothing to write. Returning here matches the kernel (which is a
+		// no-op for length <= 0) and the pure-Go fallback, and avoids the
+		// index-out-of-range panic that &out[0] raises on an empty slice.
+		return
+	}
+	_bitmap_aligned_and_not_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
+
+// _bitmap_aligned_xor_avx2 is implemented in bitmap_ops_avx2_amd64.s.
+// It writes left[i] ^ right[i] to out[i] for each of the length bytes.
+//
+//go:noescape
+func _bitmap_aligned_xor_avx2(left, right, out unsafe.Pointer, length int64)
+
+// bitmapAlignedXorAVX2 computes out[i] = left[i] ^ right[i] for every byte
+// of out using the AVX2 assembly kernel.
+//
+// left and right must each hold at least len(out) bytes; the kernel reads
+// len(out) bytes from both without bounds checks.
+func bitmapAlignedXorAVX2(left, right, out []byte) {
+	if len(out) == 0 {
+		// Nothing to write. Returning here matches the kernel (which is a
+		// no-op for length <= 0) and the pure-Go fallback, and avoids the
+		// index-out-of-range panic that &out[0] raises on an empty slice.
+		return
+	}
+	_bitmap_aligned_xor_avx2(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
diff --git a/go/arrow/bitutil/bitmap_ops_avx2_amd64.s b/go/arrow/bitutil/bitmap_ops_avx2_amd64.s
index 2e2ade89617..00172e86592 100644
--- a/go/arrow/bitutil/bitmap_ops_avx2_amd64.s
+++ b/go/arrow/bitutil/bitmap_ops_avx2_amd64.s
@@ -190,3 +190,184 @@ LBB1_6:
LBB1_12:
VZEROUPPER
RET
+
+// func _bitmap_aligned_and_not_avx2(left, right, out unsafe.Pointer, length int64)
+//
+// Writes left[i] &^ right[i] to out[i] for i in [0, length); a length <= 0
+// is a no-op. Inputs of 128 bytes or more whose destination does not
+// overlap either source use 32-byte AVX loads/stores, 128 bytes per loop
+// iteration; everything else is handled by the scalar code at LBB2_3.
+TEXT ·_bitmap_aligned_and_not_avx2(SB), $0-32
+
+ // DI = left, SI = right, DX = out, CX = length
+ MOVQ left+0(FP), DI
+ MOVQ right+8(FP), SI
+ MOVQ out+16(FP), DX
+ MOVQ length+24(FP), CX
+
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JLE LBB2_12
+ LONG $0x7ff98348 // cmp rcx, 127
+ JA LBB2_7
+ // short input: scalar path from byte index r8 = 0
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+ JMP LBB2_3
+
+ // Overlap check: r11b & bl is set when out overlaps [left, left+length),
+ // r10b & r9b when out overlaps [right, right+length). Either case falls
+ // back to the scalar path with r8 = 0.
+LBB2_7:
+ LONG $0x0a048d4c // lea r8, [rdx + rcx]
+ LONG $0x0f048d48 // lea rax, [rdi + rcx]
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd3970f41 // seta r11b
+ LONG $0x0e048d48 // lea rax, [rsi + rcx]
+ WORD $0x3949; BYTE $0xf8 // cmp r8, rdi
+ WORD $0x970f; BYTE $0xd3 // seta bl
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd2970f41 // seta r10b
+ WORD $0x3949; BYTE $0xf0 // cmp r8, rsi
+ LONG $0xd1970f41 // seta r9b
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+ WORD $0x8441; BYTE $0xdb // test r11b, bl
+ JNE LBB2_3
+ WORD $0x2045; BYTE $0xca // and r10b, r9b
+ JNE LBB2_3
+ // r8 = length rounded down to a multiple of 128; rax = running byte offset
+ WORD $0x8949; BYTE $0xc8 // mov r8, rcx
+ LONG $0x80e08349 // and r8, -128
+ WORD $0xc031 // xor eax, eax
+
+ // Vector loop: 128 bytes per iteration. The right operand is loaded first
+ // because vandnps complements its first source operand.
+LBB2_10:
+ LONG $0x0410fcc5; BYTE $0x06 // vmovups ymm0, yword [rsi + rax]
+ LONG $0x4c10fcc5; WORD $0x2006 // vmovups ymm1, yword [rsi + rax + 32]
+ LONG $0x5410fcc5; WORD $0x4006 // vmovups ymm2, yword [rsi + rax + 64]
+ LONG $0x5c10fcc5; WORD $0x6006 // vmovups ymm3, yword [rsi + rax + 96]
+ LONG $0x0455fcc5; BYTE $0x07 // vandnps ymm0, ymm0, yword [rdi + rax]
+ LONG $0x4c55f4c5; WORD $0x2007 // vandnps ymm1, ymm1, yword [rdi + rax + 32]
+ LONG $0x5455ecc5; WORD $0x4007 // vandnps ymm2, ymm2, yword [rdi + rax + 64]
+ LONG $0x5c55e4c5; WORD $0x6007 // vandnps ymm3, ymm3, yword [rdi + rax + 96]
+ LONG $0x0411fcc5; BYTE $0x02 // vmovups yword [rdx + rax], ymm0
+ LONG $0x4c11fcc5; WORD $0x2002 // vmovups yword [rdx + rax + 32], ymm1
+ LONG $0x5411fcc5; WORD $0x4002 // vmovups yword [rdx + rax + 64], ymm2
+ LONG $0x5c11fcc5; WORD $0x6002 // vmovups yword [rdx + rax + 96], ymm3
+ LONG $0x80e88348 // sub rax, -128
+ WORD $0x3949; BYTE $0xc0 // cmp r8, rax
+ JNE LBB2_10
+ // done when length was an exact multiple of 128
+ WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
+ JE LBB2_12
+
+ // Scalar path from byte index r8: peel one byte when the length is odd,
+ // then LBB2_6 handles two bytes per iteration.
+LBB2_3:
+ WORD $0x894d; BYTE $0xc1 // mov r9, r8
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0xc1f6; BYTE $0x01 // test cl, 1
+ JE LBB2_5
+ LONG $0x06048a42 // mov al, byte [rsi + r8]
+ WORD $0xd0f6 // not al
+ LONG $0x07042242 // and al, byte [rdi + r8]
+ LONG $0x02048842 // mov byte [rdx + r8], al
+ LONG $0x01c88349 // or r8, 1
+
+ // r9 = (bytes remaining at LBB2_3) - 1; zero means the peeled byte was the last
+LBB2_5:
+ WORD $0x0149; BYTE $0xc9 // add r9, rcx
+ JE LBB2_12
+
+LBB2_6:
+ LONG $0x04b60f42; BYTE $0x06 // movzx eax, byte [rsi + r8]
+ WORD $0xd0f6 // not al
+ LONG $0x07042242 // and al, byte [rdi + r8]
+ LONG $0x02048842 // mov byte [rdx + r8], al
+ LONG $0x44b60f42; WORD $0x0106 // movzx eax, byte [rsi + r8 + 1]
+ WORD $0xd0f6 // not al
+ LONG $0x07442242; BYTE $0x01 // and al, byte [rdi + r8 + 1]
+ LONG $0x02448842; BYTE $0x01 // mov byte [rdx + r8 + 1], al
+ LONG $0x02c08349 // add r8, 2
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JNE LBB2_6
+
+ // common exit
+LBB2_12:
+ VZEROUPPER
+ RET
+
+// func _bitmap_aligned_xor_avx2(left, right, out unsafe.Pointer, length int64)
+//
+// Writes left[i] ^ right[i] to out[i] for i in [0, length); a length <= 0
+// is a no-op. Inputs of 128 bytes or more whose destination does not
+// overlap either source use 32-byte AVX loads/stores, 128 bytes per loop
+// iteration; everything else is handled by the scalar code at LBB3_3.
+TEXT ·_bitmap_aligned_xor_avx2(SB), $0-32
+
+ // DI = left, SI = right, DX = out, CX = length
+ MOVQ left+0(FP), DI
+ MOVQ right+8(FP), SI
+ MOVQ out+16(FP), DX
+ MOVQ length+24(FP), CX
+
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JLE LBB3_12
+ LONG $0x7ff98348 // cmp rcx, 127
+ JA LBB3_7
+ // short input: scalar path from byte index r10 = 0
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ JMP LBB3_3
+
+ // Overlap check: r11b & bl is set when out overlaps [left, left+length),
+ // r8b & r9b when out overlaps [right, right+length). Either case falls
+ // back to the scalar path with r10 = 0.
+LBB3_7:
+ LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
+ LONG $0x0f048d48 // lea rax, [rdi + rcx]
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd3970f41 // seta r11b
+ LONG $0x0e048d48 // lea rax, [rsi + rcx]
+ WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
+ WORD $0x970f; BYTE $0xd3 // seta bl
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd0970f41 // seta r8b
+ WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
+ LONG $0xd1970f41 // seta r9b
+ WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
+ WORD $0x8441; BYTE $0xdb // test r11b, bl
+ JNE LBB3_3
+ WORD $0x2045; BYTE $0xc8 // and r8b, r9b
+ JNE LBB3_3
+ // r10 = length rounded down to a multiple of 128; r8 = running byte offset
+ WORD $0x8949; BYTE $0xca // mov r10, rcx
+ LONG $0x80e28349 // and r10, -128
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+ // Vector loop: 128 bytes per iteration.
+LBB3_10:
+ LONG $0x107ca1c4; WORD $0x0604 // vmovups ymm0, yword [rsi + r8]
+ LONG $0x107ca1c4; WORD $0x064c; BYTE $0x20 // vmovups ymm1, yword [rsi + r8 + 32]
+ LONG $0x107ca1c4; WORD $0x0654; BYTE $0x40 // vmovups ymm2, yword [rsi + r8 + 64]
+ LONG $0x107ca1c4; WORD $0x065c; BYTE $0x60 // vmovups ymm3, yword [rsi + r8 + 96]
+ LONG $0x577ca1c4; WORD $0x0704 // vxorps ymm0, ymm0, yword [rdi + r8]
+ LONG $0x5774a1c4; WORD $0x074c; BYTE $0x20 // vxorps ymm1, ymm1, yword [rdi + r8 + 32]
+ LONG $0x576ca1c4; WORD $0x0754; BYTE $0x40 // vxorps ymm2, ymm2, yword [rdi + r8 + 64]
+ LONG $0x5764a1c4; WORD $0x075c; BYTE $0x60 // vxorps ymm3, ymm3, yword [rdi + r8 + 96]
+ LONG $0x117ca1c4; WORD $0x0204 // vmovups yword [rdx + r8], ymm0
+ LONG $0x117ca1c4; WORD $0x024c; BYTE $0x20 // vmovups yword [rdx + r8 + 32], ymm1
+ LONG $0x117ca1c4; WORD $0x0254; BYTE $0x40 // vmovups yword [rdx + r8 + 64], ymm2
+ LONG $0x117ca1c4; WORD $0x025c; BYTE $0x60 // vmovups yword [rdx + r8 + 96], ymm3
+ LONG $0x80e88349 // sub r8, -128
+ WORD $0x394d; BYTE $0xc2 // cmp r10, r8
+ JNE LBB3_10
+ // done when length was an exact multiple of 128
+ WORD $0x3949; BYTE $0xca // cmp r10, rcx
+ JE LBB3_12
+
+ // Scalar path from byte index r10:
+ //   r8 = (bytes remaining) - 1
+ //   r9 = length & 3 = bytes peeled one at a time before the 4-byte loop
+LBB3_3:
+ WORD $0x894d; BYTE $0xd0 // mov r8, r10
+ WORD $0xf749; BYTE $0xd0 // not r8
+ WORD $0x0149; BYTE $0xc8 // add r8, rcx
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e18349 // and r9, 3
+ JE LBB3_5
+
+ // peel loop: one byte per iteration, r9 times
+LBB3_4:
+ LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
+ LONG $0x17043242 // xor al, byte [rdi + r10]
+ LONG $0x12048842 // mov byte [rdx + r10], al
+ LONG $0x01c28349 // add r10, 1
+ LONG $0xffc18349 // add r9, -1
+ JNE LBB3_4
+
+ // fewer than four bytes were remaining at LBB3_3: the peel loop did it all
+LBB3_5:
+ LONG $0x03f88349 // cmp r8, 3
+ JB LBB3_12
+
+ // scalar loop: four bytes per iteration until r10 reaches length
+LBB3_6:
+ LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
+ LONG $0x17043242 // xor al, byte [rdi + r10]
+ LONG $0x12048842 // mov byte [rdx + r10], al
+ LONG $0x44b60f42; WORD $0x0116 // movzx eax, byte [rsi + r10 + 1]
+ LONG $0x17443242; BYTE $0x01 // xor al, byte [rdi + r10 + 1]
+ LONG $0x12448842; BYTE $0x01 // mov byte [rdx + r10 + 1], al
+ LONG $0x44b60f42; WORD $0x0216 // movzx eax, byte [rsi + r10 + 2]
+ LONG $0x17443242; BYTE $0x02 // xor al, byte [rdi + r10 + 2]
+ LONG $0x12448842; BYTE $0x02 // mov byte [rdx + r10 + 2], al
+ LONG $0x44b60f42; WORD $0x0316 // movzx eax, byte [rsi + r10 + 3]
+ LONG $0x17443242; BYTE $0x03 // xor al, byte [rdi + r10 + 3]
+ LONG $0x12448842; BYTE $0x03 // mov byte [rdx + r10 + 3], al
+ LONG $0x04c28349 // add r10, 4
+ WORD $0x394c; BYTE $0xd1 // cmp rcx, r10
+ JNE LBB3_6
+
+ // common exit
+LBB3_12:
+ VZEROUPPER
+ RET
diff --git a/go/arrow/bitutil/bitmap_ops_noasm.go b/go/arrow/bitutil/bitmap_ops_noasm.go
index 785531c1c23..e25347791fe 100644
--- a/go/arrow/bitutil/bitmap_ops_noasm.go
+++ b/go/arrow/bitutil/bitmap_ops_noasm.go
@@ -22,4 +22,6 @@ package bitutil
+// init points the aligned kernel of every bitmap operation (and, or,
+// and-not, xor) at its pure-Go implementation.
func init() {
bitAndOp.opAligned = alignedBitAndGo
bitOrOp.opAligned = alignedBitOrGo
+ bitAndNotOp.opAligned = alignedBitAndNotGo
+ bitXorOp.opAligned = alignedBitXorGo
}
diff --git a/go/arrow/bitutil/bitmap_ops_ppc64le.go b/go/arrow/bitutil/bitmap_ops_ppc64le.go
index 86c47639a9e..28d95d84ade 100644
--- a/go/arrow/bitutil/bitmap_ops_ppc64le.go
+++ b/go/arrow/bitutil/bitmap_ops_ppc64le.go
@@ -22,4 +22,6 @@ package bitutil
+// init points the aligned kernel of every bitmap operation (and, or,
+// and-not, xor) at its pure-Go implementation.
func init() {
bitAndOp.opAligned = alignedBitAndGo
bitOrOp.opAligned = alignedBitOrGo
+ bitAndNotOp.opAligned = alignedBitAndNotGo
+ bitXorOp.opAligned = alignedBitXorGo
}
diff --git a/go/arrow/bitutil/bitmap_ops_s390x.go b/go/arrow/bitutil/bitmap_ops_s390x.go
index 86c47639a9e..28d95d84ade 100644
--- a/go/arrow/bitutil/bitmap_ops_s390x.go
+++ b/go/arrow/bitutil/bitmap_ops_s390x.go
@@ -22,4 +22,6 @@ package bitutil
+// init points the aligned kernel of every bitmap operation (and, or,
+// and-not, xor) at its pure-Go implementation.
func init() {
bitAndOp.opAligned = alignedBitAndGo
bitOrOp.opAligned = alignedBitOrGo
+ bitAndNotOp.opAligned = alignedBitAndNotGo
+ bitXorOp.opAligned = alignedBitXorGo
}
diff --git a/go/arrow/bitutil/bitmap_ops_sse4_amd64.go b/go/arrow/bitutil/bitmap_ops_sse4_amd64.go
index 5d1fcf96829..f16bce12bbf 100644
--- a/go/arrow/bitutil/bitmap_ops_sse4_amd64.go
+++ b/go/arrow/bitutil/bitmap_ops_sse4_amd64.go
@@ -36,3 +36,17 @@ func _bitmap_aligned_or_sse4(left, right, out unsafe.Pointer, length int64)
func bitmapAlignedOrSSE4(left, right, out []byte) {
_bitmap_aligned_or_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
}
+
+// _bitmap_aligned_and_not_sse4 is implemented in bitmap_ops_sse4_amd64.s.
+// It writes left[i] &^ right[i] to out[i] for each of the length bytes.
+//
+//go:noescape
+func _bitmap_aligned_and_not_sse4(left, right, out unsafe.Pointer, length int64)
+
+// bitmapAlignedAndNotSSE4 computes out[i] = left[i] &^ right[i] for every
+// byte of out using the SSE assembly kernel.
+//
+// left and right must each hold at least len(out) bytes; the kernel reads
+// len(out) bytes from both without bounds checks.
+func bitmapAlignedAndNotSSE4(left, right, out []byte) {
+	if len(out) == 0 {
+		// Nothing to write. Returning here matches the kernel (which is a
+		// no-op for length <= 0) and the pure-Go fallback, and avoids the
+		// index-out-of-range panic that &out[0] raises on an empty slice.
+		return
+	}
+	_bitmap_aligned_and_not_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
+
+// _bitmap_aligned_xor_sse4 is implemented in bitmap_ops_sse4_amd64.s.
+// It writes left[i] ^ right[i] to out[i] for each of the length bytes.
+//
+//go:noescape
+func _bitmap_aligned_xor_sse4(left, right, out unsafe.Pointer, length int64)
+
+// bitmapAlignedXorSSE4 computes out[i] = left[i] ^ right[i] for every byte
+// of out using the SSE assembly kernel.
+//
+// left and right must each hold at least len(out) bytes; the kernel reads
+// len(out) bytes from both without bounds checks.
+func bitmapAlignedXorSSE4(left, right, out []byte) {
+	if len(out) == 0 {
+		// Nothing to write. Returning here matches the kernel (which is a
+		// no-op for length <= 0) and the pure-Go fallback, and avoids the
+		// index-out-of-range panic that &out[0] raises on an empty slice.
+		return
+	}
+	_bitmap_aligned_xor_sse4(unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), int64(len(out)))
+}
diff --git a/go/arrow/bitutil/bitmap_ops_sse4_amd64.s b/go/arrow/bitutil/bitmap_ops_sse4_amd64.s
index ad81cf63720..c15e186253a 100644
--- a/go/arrow/bitutil/bitmap_ops_sse4_amd64.s
+++ b/go/arrow/bitutil/bitmap_ops_sse4_amd64.s
@@ -254,3 +254,248 @@ LBB1_10:
LONG $0x01c1f641 // test r9b, 1
JNE LBB1_14
JMP LBB1_15
+
+// func _bitmap_aligned_and_not_sse4(left, right, out unsafe.Pointer, length int64)
+//
+// Writes left[i] &^ right[i] to out[i] for i in [0, length); a length <= 0
+// is a no-op. Inputs of 32 bytes or more whose destination does not overlap
+// either source use 16-byte SSE loads/stores; everything else is handled by
+// the scalar code at LBB2_3.
+TEXT ·_bitmap_aligned_and_not_sse4(SB), $0-32
+
+ // DI = left, SI = right, DX = out, CX = length
+ MOVQ left+0(FP), DI
+ MOVQ right+8(FP), SI
+ MOVQ out+16(FP), DX
+ MOVQ length+24(FP), CX
+
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JLE LBB2_16
+ LONG $0x1ff98348 // cmp rcx, 31
+ JA LBB2_7
+ // short input: scalar path from byte index r11 = 0
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+
+ // Scalar path from byte index r11: peel one byte when the length is odd,
+ // then LBB2_6 handles two bytes per iteration.
+LBB2_3:
+ WORD $0x894d; BYTE $0xd8 // mov r8, r11
+ WORD $0xf749; BYTE $0xd0 // not r8
+ WORD $0xc1f6; BYTE $0x01 // test cl, 1
+ JE LBB2_5
+ LONG $0x1e048a42 // mov al, byte [rsi + r11]
+ WORD $0xd0f6 // not al
+ LONG $0x1f042242 // and al, byte [rdi + r11]
+ LONG $0x1a048842 // mov byte [rdx + r11], al
+ LONG $0x01cb8349 // or r11, 1
+
+ // r8 = (bytes remaining at LBB2_3) - 1; zero means the peeled byte was the last
+LBB2_5:
+ WORD $0x0149; BYTE $0xc8 // add r8, rcx
+ JE LBB2_16
+
+LBB2_6:
+ LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
+ WORD $0xd0f6 // not al
+ LONG $0x1f042242 // and al, byte [rdi + r11]
+ LONG $0x1a048842 // mov byte [rdx + r11], al
+ LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1]
+ WORD $0xd0f6 // not al
+ LONG $0x1f442242; BYTE $0x01 // and al, byte [rdi + r11 + 1]
+ LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al
+ LONG $0x02c38349 // add r11, 2
+ WORD $0x394c; BYTE $0xd9 // cmp rcx, r11
+ JNE LBB2_6
+ JMP LBB2_16
+
+ // Overlap check: r10b & bl is set when out overlaps [left, left+length),
+ // r8b & r9b when out overlaps [right, right+length). Either case falls
+ // back to the scalar path with r11 = 0.
+LBB2_7:
+ LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
+ LONG $0x0f048d48 // lea rax, [rdi + rcx]
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd2970f41 // seta r10b
+ LONG $0x0e048d48 // lea rax, [rsi + rcx]
+ WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
+ WORD $0x970f; BYTE $0xd3 // seta bl
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd0970f41 // seta r8b
+ WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
+ LONG $0xd1970f41 // seta r9b
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ WORD $0x8441; BYTE $0xda // test r10b, bl
+ JNE LBB2_3
+ WORD $0x2045; BYTE $0xc8 // and r8b, r9b
+ JNE LBB2_3
+ // r11 = length rounded down to a multiple of 32; r9 = number of 32-byte
+ // chunks. A single chunk (rax == 0) skips the unrolled loop.
+ WORD $0x8949; BYTE $0xcb // mov r11, rcx
+ LONG $0xe0e38349 // and r11, -32
+ LONG $0xe0438d49 // lea rax, [r11 - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB2_10
+ // r10 = -(r9 rounded down to even): counts up to zero, two chunks per pass
+ WORD $0x894d; BYTE $0xca // mov r10, r9
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+ // Vector loop: 64 bytes per iteration. andnps dst, src computes
+ // dst = ~dst & src, so the right operand is loaded into the destination.
+LBB2_12:
+ LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
+ LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
+ LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
+ WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0
+ LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
+ WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1
+ LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
+ LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
+ LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32]
+ LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48]
+ LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32]
+ WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0
+ LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48]
+ WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1
+ LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2
+ LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0
+ LONG $0x40c08349 // add r8, 64
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB2_12
+ // an odd chunk count leaves one 32-byte chunk for LBB2_14
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_15
+
+LBB2_14:
+ LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
+ LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
+ LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
+ WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0
+ LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
+ WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1
+ LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
+ LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
+
+ // bytes past the last full 32-byte chunk (if any) go to the scalar path
+LBB2_15:
+ WORD $0x3949; BYTE $0xcb // cmp r11, rcx
+ JNE LBB2_3
+
+ // common exit
+LBB2_16:
+ RET
+
+ // exactly one 32-byte chunk: start at offset 0 and reuse LBB2_14
+LBB2_10:
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+ LONG $0x01c1f641 // test r9b, 1
+ JNE LBB2_14
+ JMP LBB2_15
+
+// func _bitmap_aligned_xor_sse4(left, right, out unsafe.Pointer, length int64)
+//
+// Writes left[i] ^ right[i] to out[i] for i in [0, length); a length <= 0
+// is a no-op. Inputs of 32 bytes or more whose destination does not overlap
+// either source use 16-byte SSE loads/stores; everything else is handled by
+// the scalar code at LBB3_3.
+TEXT ·_bitmap_aligned_xor_sse4(SB), $0-32
+
+ // DI = left, SI = right, DX = out, CX = length
+ MOVQ left+0(FP), DI
+ MOVQ right+8(FP), SI
+ MOVQ out+16(FP), DX
+ MOVQ length+24(FP), CX
+
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JLE LBB3_16
+ LONG $0x1ff98348 // cmp rcx, 31
+ JA LBB3_7
+ // short input: scalar path from byte index r11 = 0
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+
+ // Scalar path from byte index r11:
+ //   r8 = (bytes remaining) - 1
+ //   r9 = length & 3 = bytes peeled one at a time before the 4-byte loop
+LBB3_3:
+ WORD $0x894d; BYTE $0xd8 // mov r8, r11
+ WORD $0xf749; BYTE $0xd0 // not r8
+ WORD $0x0149; BYTE $0xc8 // add r8, rcx
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e18349 // and r9, 3
+ JE LBB3_5
+
+ // peel loop: one byte per iteration, r9 times
+LBB3_4:
+ LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
+ LONG $0x1f043242 // xor al, byte [rdi + r11]
+ LONG $0x1a048842 // mov byte [rdx + r11], al
+ LONG $0x01c38349 // add r11, 1
+ LONG $0xffc18349 // add r9, -1
+ JNE LBB3_4
+
+ // fewer than four bytes were remaining at LBB3_3: the peel loop did it all
+LBB3_5:
+ LONG $0x03f88349 // cmp r8, 3
+ JB LBB3_16
+
+ // scalar loop: four bytes per iteration until r11 reaches length
+LBB3_6:
+ LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
+ LONG $0x1f043242 // xor al, byte [rdi + r11]
+ LONG $0x1a048842 // mov byte [rdx + r11], al
+ LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1]
+ LONG $0x1f443242; BYTE $0x01 // xor al, byte [rdi + r11 + 1]
+ LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al
+ LONG $0x44b60f42; WORD $0x021e // movzx eax, byte [rsi + r11 + 2]
+ LONG $0x1f443242; BYTE $0x02 // xor al, byte [rdi + r11 + 2]
+ LONG $0x1a448842; BYTE $0x02 // mov byte [rdx + r11 + 2], al
+ LONG $0x44b60f42; WORD $0x031e // movzx eax, byte [rsi + r11 + 3]
+ LONG $0x1f443242; BYTE $0x03 // xor al, byte [rdi + r11 + 3]
+ LONG $0x1a448842; BYTE $0x03 // mov byte [rdx + r11 + 3], al
+ LONG $0x04c38349 // add r11, 4
+ WORD $0x394c; BYTE $0xd9 // cmp rcx, r11
+ JNE LBB3_6
+ JMP LBB3_16
+
+ // Overlap check: r10b & bl is set when out overlaps [left, left+length),
+ // r8b & r9b when out overlaps [right, right+length). Either case falls
+ // back to the scalar path with r11 = 0.
+LBB3_7:
+ LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
+ LONG $0x0f048d48 // lea rax, [rdi + rcx]
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd2970f41 // seta r10b
+ LONG $0x0e048d48 // lea rax, [rsi + rcx]
+ WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
+ WORD $0x970f; BYTE $0xd3 // seta bl
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ LONG $0xd0970f41 // seta r8b
+ WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
+ LONG $0xd1970f41 // seta r9b
+ WORD $0x3145; BYTE $0xdb // xor r11d, r11d
+ WORD $0x8441; BYTE $0xda // test r10b, bl
+ JNE LBB3_3
+ WORD $0x2045; BYTE $0xc8 // and r8b, r9b
+ JNE LBB3_3
+ // r11 = length rounded down to a multiple of 32; r9 = number of 32-byte
+ // chunks. A single chunk (rax == 0) skips the unrolled loop.
+ WORD $0x8949; BYTE $0xcb // mov r11, rcx
+ LONG $0xe0e38349 // and r11, -32
+ LONG $0xe0438d49 // lea rax, [r11 - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB3_10
+ // r10 = -(r9 rounded down to even): counts up to zero, two chunks per pass
+ WORD $0x894d; BYTE $0xca // mov r10, r9
+ LONG $0xfee28349 // and r10, -2
+ WORD $0xf749; BYTE $0xda // neg r10
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+
+ // Vector loop: 64 bytes per iteration.
+LBB3_12:
+ LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
+ LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
+ LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
+ WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0
+ LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
+ WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1
+ LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
+ LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
+ LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32]
+ LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48]
+ LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32]
+ WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0
+ LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48]
+ WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1
+ LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2
+ LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0
+ LONG $0x40c08349 // add r8, 64
+ LONG $0x02c28349 // add r10, 2
+ JNE LBB3_12
+ // an odd chunk count leaves one 32-byte chunk for LBB3_14
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB3_15
+
+LBB3_14:
+ LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
+ LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
+ LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
+ WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0
+ LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
+ WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1
+ LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
+ LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
+
+ // bytes past the last full 32-byte chunk (if any) go to the scalar path
+LBB3_15:
+ WORD $0x3949; BYTE $0xcb // cmp r11, rcx
+ JNE LBB3_3
+
+ // common exit
+LBB3_16:
+ RET
+
+ // exactly one 32-byte chunk: start at offset 0 and reuse LBB3_14
+LBB3_10:
+ WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
+ LONG $0x01c1f641 // test r9b, 1
+ JNE LBB3_14
+ JMP LBB3_15
diff --git a/go/arrow/bitutil/bitmaps.go b/go/arrow/bitutil/bitmaps.go
index abd1b188a74..c23a1232921 100644
--- a/go/arrow/bitutil/bitmaps.go
+++ b/go/arrow/bitutil/bitmaps.go
@@ -18,6 +18,7 @@ package bitutil
import (
"bytes"
+ "errors"
"math/bits"
"unsafe"
@@ -374,9 +375,14 @@ func (bm *BitmapWordWriter) PutNextTrailingByte(b byte, validBits int) {
}
}
-// CopyBitmap copies the bitmap indicated by src, starting at bit offset srcOffset,
-// and copying length bits into dst, starting at bit offset dstOffset.
-func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+// transferMode selects what transferBitmap does with each source bit while
+// moving it into the destination.
+type transferMode int8
+
+const (
+	// transferCopy writes the source bits unchanged.
+	transferCopy transferMode = 0
+	// transferInvert writes the complement of the source bits.
+	transferInvert transferMode = 1
+)
+
+func transferBitmap(mode transferMode, src []byte, srcOffset, length int, dst []byte, dstOffset int) {
if length == 0 {
// if there's nothing to write, end early.
return
@@ -393,12 +399,19 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
nwords := rdr.Words()
for nwords > 0 {
nwords--
- wr.PutNextWord(rdr.NextWord())
+ if mode == transferInvert {
+ wr.PutNextWord(^rdr.NextWord())
+ } else {
+ wr.PutNextWord(rdr.NextWord())
+ }
}
nbytes := rdr.TrailingBytes()
for nbytes > 0 {
nbytes--
bt, validBits := rdr.NextTrailingByte()
+ if mode == transferInvert {
+ bt = ^bt
+ }
wr.PutNextTrailingByte(bt, validBits)
}
return
@@ -417,14 +430,33 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
// - high 5 bits: old bits from last byte of dest buffer
trailingBits := nbytes*8 - length
trailMask := byte(uint(1)<<(8-trailingBits)) - 1
-
- copy(dst, src[:nbytes-1])
- lastData := src[nbytes-1]
+ var lastData byte
+ if mode == transferInvert {
+ for i, b := range src[:nbytes-1] {
+ dst[i] = ^b
+ }
+ lastData = ^src[nbytes-1]
+ } else {
+ copy(dst, src[:nbytes-1])
+ lastData = src[nbytes-1]
+ }
dst[nbytes-1] &= ^trailMask
dst[nbytes-1] |= lastData & trailMask
}
+// CopyBitmap copies the bitmap indicated by src, starting at bit offset srcOffset,
+// and copying length bits into dst, starting at bit offset dstOffset.
+//
+// It is transferBitmap run in transferCopy mode; a length of 0 is a no-op.
+func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+ transferBitmap(transferCopy, src, srcOffset, length, dst, dstOffset)
+}
+
+// InvertBitmap copies a bit range of a bitmap, inverting it as it copies
+// over into the destination.
+//
+// The arguments mirror CopyBitmap: length bits are read from src starting at
+// bit offset srcOffset and their complement is written into dst starting at
+// bit offset dstOffset. It is transferBitmap run in transferInvert mode; a
+// length of 0 is a no-op.
+func InvertBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) {
+ transferBitmap(transferInvert, src, srcOffset, length, dst, dstOffset)
+}
+
type bitOp struct {
opWord func(uint64, uint64) uint64
opByte func(byte, byte) byte
@@ -440,6 +472,14 @@ var (
opWord: func(l, r uint64) uint64 { return l | r },
opByte: func(l, r byte) byte { return l | r },
}
+ // bitAndNotOp keeps the bits of the left operand that are clear in the
+ // right operand (l &^ r). Its opAligned kernel is assigned by the
+ // architecture-specific init functions.
+ bitAndNotOp = bitOp{
+ opWord: func(l, r uint64) uint64 { return l &^ r },
+ opByte: func(l, r byte) byte { return l &^ r },
+ }
+ // bitXorOp yields the bits that differ between the two operands (l ^ r).
+ // Its opAligned kernel is assigned by the architecture-specific init
+ // functions.
+ bitXorOp = bitOp{
+ opWord: func(l, r uint64) uint64 { return l ^ r },
+ opByte: func(l, r byte) byte { return l ^ r },
+ }
)
func alignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) {
@@ -532,6 +572,22 @@ func BitmapOrAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset in
return BitmapOpAlloc(mem, bitOrOp, left, right, lOffset, rOffset, length, outOffset)
}
+// BitmapAndNot applies the AND-NOT operation (left &^ right: bits set in
+// left and clear in right) to the two input bitmaps through BitmapOp,
+// writing the result into out.
+// NOTE(review): lOffset, rOffset, outOffset and length are presumably counted in bits, as in BitmapEquals — confirm against BitmapOp.
+func BitmapAndNot(left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) {
+ BitmapOp(bitAndNotOp, left, right, lOffset, rOffset, out, outOffset, length)
+}
+
+// BitmapAndNotAlloc applies the AND-NOT operation (left &^ right) to the two
+// input bitmaps through BitmapOpAlloc and returns the buffer it produces,
+// passing mem along as the allocator.
+// Note the trailing parameter order (length, outOffset), which differs from BitmapAndNot.
+// NOTE(review): offsets and length are presumably counted in bits — confirm against BitmapOpAlloc.
+func BitmapAndNotAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset int64, length, outOffset int64) *memory.Buffer {
+ return BitmapOpAlloc(mem, bitAndNotOp, left, right, lOffset, rOffset, length, outOffset)
+}
+
+// BitmapXor applies the XOR operation (left ^ right) to the two input
+// bitmaps through BitmapOp, writing the result into out.
+// NOTE(review): lOffset, rOffset, outOffset and length are presumably counted in bits, as in BitmapEquals — confirm against BitmapOp.
+func BitmapXor(left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) {
+ BitmapOp(bitXorOp, left, right, lOffset, rOffset, out, outOffset, length)
+}
+
+// BitmapXorAlloc applies the XOR operation (left ^ right) to the two input
+// bitmaps through BitmapOpAlloc and returns the buffer it produces, passing
+// mem along as the allocator.
+// Note the trailing parameter order (length, outOffset), which differs from BitmapXor.
+// NOTE(review): offsets and length are presumably counted in bits — confirm against BitmapOpAlloc.
+func BitmapXorAlloc(mem memory.Allocator, left, right []byte, lOffset, rOffset int64, length, outOffset int64) *memory.Buffer {
+ return BitmapOpAlloc(mem, bitXorOp, left, right, lOffset, rOffset, length, outOffset)
+}
+
func BitmapEquals(left, right []byte, lOffset, rOffset int64, length int64) bool {
if lOffset%8 == 0 && rOffset%8 == 0 {
// byte aligned, fast path, can use bytes.Equal (memcmp)
@@ -584,3 +640,108 @@ type OptionalBitIndexer struct {
func (b *OptionalBitIndexer) GetBit(i int) bool {
return b.Bitmap == nil || BitIsSet(b.Bitmap, b.Offset+i)
}
+
+// Bitmap describes a range of bits inside a byte slice. It is the argument
+// type consumed by VisitWordsAndWrite.
+type Bitmap struct {
+ // Data is the backing byte slice.
+ Data []byte
+ // Offset and Len are handed to NewBitmapWordReader/NewBitmapWordWriter as
+ // the offset and length of the range (presumably both counted in bits).
+ Offset, Len int64
+}
+
+// bitLength returns the length shared by every bitmap in bitmaps.
+//
+// It returns -1 and an error when bitmaps is empty (there is no length to
+// report) or when any bitmap's Len differs from that of the first one.
+func bitLength(bitmaps []Bitmap) (int64, error) {
+	// guard the empty case: bitmaps[0] and bitmaps[1:] below would
+	// otherwise panic instead of reporting a usable error
+	if len(bitmaps) == 0 {
+		return -1, errors.New("at least one bitmap is required")
+	}
+
+	expected := bitmaps[0].Len
+	for _, b := range bitmaps[1:] {
+		if b.Len != expected {
+			return -1, errors.New("bitmaps must be same length")
+		}
+	}
+	return expected, nil
+}
+
+// runVisitWordsAndWriteLoop drives visitor over the inputs, first one
+// 64-bit word at a time and then one trailing byte at a time.
+//
+// On every step it collects one word (or trailing byte) from each reader in
+// rdrs, calls visitor(in, out) and forwards out[i] to wrs[i]. bitLen is the
+// common bit length of the inputs. rdrs must not be empty: rdrs[0] supplies
+// the word count and the trailing byte count used for every reader.
+func runVisitWordsAndWriteLoop(bitLen int64, rdrs []*BitmapWordReader, wrs []*BitmapWordWriter, visitor func(in, out []uint64)) {
+ const bitWidth int64 = int64(uint64SizeBits)
+
+ // scratch slices handed to visitor on every step
+ visited := make([]uint64, len(rdrs))
+ output := make([]uint64, len(wrs))
+
+ // every reader has the same number of words, since they all have the
+ // same length. This will be inefficient in some cases: when there are
+ // offsets beyond the word boundary, every word has to be created from
+ // 2 adjoining words
+ nwords := int64(rdrs[0].Words())
+ // from here on bitLen only counts the bits left after the full words
+ bitLen -= nwords * bitWidth
+ for nwords > 0 {
+ nwords--
+ for i := range visited {
+ visited[i] = rdrs[i].NextWord()
+ }
+ visitor(visited, output)
+ for i := range output {
+ wrs[i].PutNextWord(output[i])
+ }
+ }
+
+ // every reader will have the same number of trailing bytes, because
+ // we already confirmed they have the same length. Because offsets
+ // beyond the word boundary can cause adjoining words, the trailing
+ // portion could still leave more than one full/partial word to write.
+ if bitLen == 0 {
+ return
+ }
+
+ // convert the word visitor to a byte visitor: each input byte is widened
+ // into the uint64 scratch slice, the word visitor runs on it, and only
+ // the low byte of each output word is kept
+ byteVisitor := func(in, out []byte) {
+ for i, w := range in {
+ visited[i] = uint64(w)
+ }
+ visitor(visited, output)
+ for i, w := range output {
+ out[i] = byte(w)
+ }
+ }
+
+ visitedBytes := make([]byte, len(rdrs))
+ outputBytes := make([]byte, len(wrs))
+ nbytes := rdrs[0].trailingBytes
+ for nbytes > 0 {
+ nbytes--
+ // reset both scratch buffers to 0 (via memory.Set) before each step
+ memory.Set(visitedBytes, 0)
+ memory.Set(outputBytes, 0)
+
+ // validBits is overwritten by every reader; the value reported by the
+ // last reader is the one passed to all writers below
+ var validBits int
+ for i := range rdrs {
+ visitedBytes[i], validBits = rdrs[i].NextTrailingByte()
+ }
+ byteVisitor(visitedBytes, outputBytes)
+ for i, w := range outputBytes {
+ wrs[i].PutNextTrailingByte(w, validBits)
+ }
+ }
+}
+
+// VisitWordsAndWrite visits words of bits from each input bitmap and
+// collects outputs to a slice of output Bitmaps.
+//
+// All input bitmaps must have identical lengths, otherwise an error is
+// returned; args must contain at least one bitmap. The lengths of the out
+// bitmaps are not validated here. The first bit in a visited bitmap may be
+// offset within the first visited word, but words will otherwise contain
+// densely packed bits loaded from the bitmap.
+//
+// visitor receives one word per input bitmap in in and must store one word
+// per output bitmap in out.
+//
+// NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+// It also has a large prolog/epilog overhead and should be used
+// carefully in other cases. For 2 or fewer bitmaps, and/or smaller
+// bitmaps, try BitmapReader and/or other utilities.
+func VisitWordsAndWrite(args []Bitmap, out []Bitmap, visitor func(in, out []uint64)) error {
+ bitLen, err := bitLength(args)
+ if err != nil {
+ return err
+ }
+
+ // one word reader per input and one word writer per output
+ rdrs, wrs := make([]*BitmapWordReader, len(args)), make([]*BitmapWordWriter, len(out))
+ for i, in := range args {
+ rdrs[i] = NewBitmapWordReader(in.Data, int(in.Offset), int(in.Len))
+ }
+ for i, o := range out {
+ wrs[i] = NewBitmapWordWriter(o.Data, int(o.Offset), int(o.Len))
+ }
+ runVisitWordsAndWriteLoop(bitLen, rdrs, wrs, visitor)
+ return nil
+}
diff --git a/go/arrow/compute/arithmetic.go b/go/arrow/compute/arithmetic.go
new file mode 100644
index 00000000000..49c3c24160e
--- /dev/null
+++ b/go/arrow/compute/arithmetic.go
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compute
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/kernels"
+)
+
+// arithmeticFunction is a ScalarFunction whose DispatchBest additionally
+// handles decimal arguments and promotes the argument types of binary
+// calls before resolving a kernel.
+type arithmeticFunction struct {
+ ScalarFunction
+
+ // promote is the decimal promotion mode handed to castBinaryDecimalArgs;
+ // decPromoteNone makes binary calls with a decimal argument an error.
+ promote decimalPromotion
+}
+
+// checkDecimals prepares decimal arguments for dispatch.
+//
+// Calls without any decimal argument, and calls that are not binary, are
+// accepted as they are. For the remaining calls the arguments are handed to
+// castBinaryDecimalArgs together with fn.promote, unless the function does
+// not support decimals (decPromoteNone), which is reported as an error
+// wrapping arrow.ErrInvalid.
+func (fn *arithmeticFunction) checkDecimals(vals ...arrow.DataType) error {
+	if !hasDecimal(vals...) || len(vals) != 2 {
+		return nil
+	}
+
+	switch fn.promote {
+	case decPromoteNone:
+		return fmt.Errorf("%w: invalid decimal function: %s", arrow.ErrInvalid, fn.name)
+	default:
+		return castBinaryDecimalArgs(fn.promote, vals...)
+	}
+}
+
+// DispatchBest resolves the kernel to run for the given argument types.
+//
+// After validating the arity and the decimal arguments it first tries an
+// exact match. When that fails, dictionary types are decoded and, for
+// binary calls only, null types are replaced by the other argument's type
+// and the arguments are promoted to a common temporal resolution or, failing
+// that, a common numeric type. The exact dispatch is then retried and its
+// result returned.
+func (fn *arithmeticFunction) DispatchBest(vals ...arrow.DataType) (exec.Kernel, error) {
+	if arityErr := fn.checkArity(len(vals)); arityErr != nil {
+		return nil, arityErr
+	}
+
+	if decErr := fn.checkDecimals(vals...); decErr != nil {
+		return nil, decErr
+	}
+
+	// fast path: the types may already match a kernel exactly
+	if exact, exactErr := fn.DispatchExact(vals...); exactErr == nil {
+		return exact, nil
+	}
+
+	ensureDictionaryDecoded(vals...)
+
+	// only promote types for binary funcs
+	if len(vals) == 2 {
+		replaceNullWithOtherType(vals...)
+		if unit, isTemporal := commonTemporalResolution(vals...); isTemporal {
+			replaceTemporalTypes(unit, vals...)
+		} else if common := commonNumeric(vals...); common != nil {
+			replaceTypes(common, vals...)
+		}
+	}
+
+	return fn.DispatchExact(vals...)
+}
+
+var (
+ // addDoc is the FunctionDoc passed to every function created by
+ // RegisterScalarArithmetic; it is declared here without an initializer.
+ addDoc FunctionDoc
+)
+
+// RegisterScalarArithmetic adds the binary arithmetic functions to reg:
+// "add_unchecked", "add", "sub_unchecked" and "sub", in that order.
+//
+// The "_unchecked" functions are populated from the OpAdd/OpSub kernels and
+// the plain names from the OpAddChecked/OpSubChecked kernels. Every function
+// is created with addDoc and decPromoteAdd and is registered with false as
+// the second argument of AddFunction. A failure to add a kernel panics.
+func RegisterScalarArithmetic(reg FunctionRegistry) {
+ // addition without overflow checking
+ addFn := &arithmeticFunction{*NewScalarFunction("add_unchecked", Binary(), addDoc), decPromoteAdd}
+ for _, k := range kernels.GetArithmeticKernels(kernels.OpAdd) {
+ if err := addFn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(addFn, false)
+
+ // addition with overflow checking
+ addCheckedFn := &arithmeticFunction{*NewScalarFunction("add", Binary(), addDoc), decPromoteAdd}
+ for _, k := range kernels.GetArithmeticKernels(kernels.OpAddChecked) {
+ if err := addCheckedFn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(addCheckedFn, false)
+
+ // subtraction without overflow checking; it reuses addDoc and
+ // decPromoteAdd just like the addition functions
+ subFn := &arithmeticFunction{*NewScalarFunction("sub_unchecked", Binary(), addDoc), decPromoteAdd}
+ for _, k := range kernels.GetArithmeticKernels(kernels.OpSub) {
+ if err := subFn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(subFn, false)
+
+ // subtraction with overflow checking
+ subCheckedFn := &arithmeticFunction{*NewScalarFunction("sub", Binary(), addDoc), decPromoteAdd}
+ for _, k := range kernels.GetArithmeticKernels(kernels.OpSubChecked) {
+ if err := subCheckedFn.AddKernel(k); err != nil {
+ panic(err)
+ }
+ }
+
+ reg.AddFunction(subCheckedFn, false)
+}
+
+// Add performs an addition between the passed in arguments (scalar or array)
+// and returns the result. If one argument is a scalar and the other is an
+// array, the scalar value is added to each value of the array.
+//
+// ArithmeticOptions specifies whether or not to check for overflows. By
+// default the checked "add" function is called, which will error on an
+// overflow; setting opts.NoCheckOverflow calls "add_unchecked" instead,
+// which is faster because it does not explicitly check for overflows.
+func Add(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+ fn := "add"
+ if opts.NoCheckOverflow {
+ fn = "add_unchecked"
+ }
+ // opts only selects the function name; nil is passed as the function options
+ return CallFunction(ctx, fn, nil, left, right)
+}
+
+// Subtract performs a subtraction between the passed in arguments (scalar or
+// array) and returns the result. If one argument is a scalar and the other
+// is an array, the operation is applied between the scalar and each value of
+// the array, with left always being the minuend.
+//
+// ArithmeticOptions specifies whether or not to check for overflows. By
+// default the checked "sub" function is called, which will error on an
+// overflow; setting opts.NoCheckOverflow calls "sub_unchecked" instead,
+// which is faster because it does not explicitly check for overflows.
+func Subtract(ctx context.Context, opts ArithmeticOptions, left, right Datum) (Datum, error) {
+ fn := "sub"
+ if opts.NoCheckOverflow {
+ fn = "sub_unchecked"
+ }
+ // opts only selects the function name; nil is passed as the function options
+ return CallFunction(ctx, fn, nil, left, right)
+}
diff --git a/go/arrow/compute/arithmetic_test.go b/go/arrow/compute/arithmetic_test.go
new file mode 100644
index 00000000000..2da7a62fe86
--- /dev/null
+++ b/go/arrow/compute/arithmetic_test.go
@@ -0,0 +1,502 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compute_test
+
+import (
+ "context"
+ "fmt"
+ "math"
+ "strings"
+ "testing"
+
+ "github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/array"
+ "github.com/apache/arrow/go/v10/arrow/compute"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/internal/testing/gen"
+ "github.com/apache/arrow/go/v10/arrow/memory"
+ "github.com/apache/arrow/go/v10/arrow/scalar"
+ "github.com/klauspost/cpuid/v2"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ "github.com/stretchr/testify/suite"
+)
+
+var (
+ // CpuCacheSizes holds the sizes, in bytes, of the L1 data, L2 and L3
+ // caches (indexes 0, 1 and 2). The literals below are fallbacks that init
+ // overwrites with the values reported by cpuid whenever those are not -1.
+ CpuCacheSizes = [...]int{ // defaults
+ 32 * 1024, // level 1: 32K
+ 256 * 1024, // level 2: 256K
+ 3072 * 1024, // level 3: 3M
+ }
+)
+
+// init replaces the default entries of CpuCacheSizes with the cache sizes
+// reported by cpuid. A reported size of -1 is skipped so that the
+// corresponding default is kept.
+func init() {
+	detected := [...]int{
+		cpuid.CPU.Cache.L1D,
+		cpuid.CPU.Cache.L2,
+		cpuid.CPU.Cache.L3,
+	}
+	for level, size := range detected {
+		if size != -1 {
+			CpuCacheSizes[level] = size
+		}
+	}
+}
+
+// binaryArithmeticFunc is the signature shared by compute.Add and
+// compute.Subtract: a binary operation taking a context and
+// ArithmeticOptions.
+type binaryArithmeticFunc = func(context.Context, compute.ArithmeticOptions, compute.Datum, compute.Datum) (compute.Datum, error)
+
+// binaryFunc is a binary operation with the context and options already
+// bound, as consumed by the package-level assert helpers.
+type binaryFunc = func(left, right compute.Datum) (compute.Datum, error)
+
+// assertScalarEquals records a test failure, printing both scalars, when
+// scalar.Equals reports that expected and actual differ.
+func assertScalarEquals(t *testing.T, expected, actual scalar.Scalar) {
+ assert.Truef(t, scalar.Equals(expected, actual), "expected: %s\ngot: %s", expected, actual)
+}
+
+// assertBinop checks fn against expected twice: once with the whole arrays
+// as (Array, Array) inputs and once element by element with
+// (Scalar, Scalar) inputs.
+//
+// Every error that would leave a nil value for a later step to dereference
+// aborts the test through require rather than being recorded (or ignored)
+// and followed by a panic.
+func assertBinop(t *testing.T, fn binaryFunc, left, right, expected arrow.Array) {
+	actual, err := fn(&compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()})
+	require.NoError(t, err)
+	defer actual.Release()
+	assertDatumsEqual(t, &compute.ArrayDatum{Value: expected.Data()}, actual)
+
+	// also check (Scalar, Scalar) operations
+	for i := 0; i < expected.Len(); i++ {
+		s, err := scalar.GetScalar(expected, i)
+		require.NoError(t, err)
+		lhs, err := scalar.GetScalar(left, i)
+		require.NoError(t, err)
+		rhs, err := scalar.GetScalar(right, i)
+		require.NoError(t, err)
+
+		actual, err := fn(&compute.ScalarDatum{Value: lhs}, &compute.ScalarDatum{Value: rhs})
+		// require, not assert: the type assertion below panics on a nil Datum
+		require.NoError(t, err)
+		assertScalarEquals(t, s, actual.(*compute.ScalarDatum).Value)
+	}
+}
+
+// assertBinopErr runs fn on the two arrays and asserts that it fails with
+// an error that wraps arrow.ErrInvalid and whose text contains expectedMsg.
+func assertBinopErr(t *testing.T, fn binaryFunc, left, right arrow.Array, expectedMsg string) {
+	// keyed fields on both literals, matching the rest of this file
+	_, err := fn(&compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()})
+	assert.ErrorIs(t, err, arrow.ErrInvalid)
+	assert.ErrorContains(t, err, expectedMsg)
+}
+
+// BinaryFuncTestSuite is the shared base of the binary function suites. It
+// gives every test a fresh checked allocator and a context carrying it.
+type BinaryFuncTestSuite struct {
+ suite.Suite
+
+ // mem is created in SetupTest and verified in TearDownTest
+ mem *memory.CheckedAllocator
+ // ctx is built in SetupTest with compute.WithAllocator using mem
+ ctx context.Context
+}
+
+// SetupTest creates a new checked allocator wrapping memory.DefaultAllocator
+// and a context that carries it via compute.WithAllocator.
+func (b *BinaryFuncTestSuite) SetupTest() {
+ b.mem = memory.NewCheckedAllocator(memory.DefaultAllocator)
+ b.ctx = compute.WithAllocator(context.TODO(), b.mem)
+}
+
+// TearDownTest asserts that the checked allocator reports a size of 0, i.e.
+// that the test released everything it allocated through b.mem.
+func (b *BinaryFuncTestSuite) TearDownTest() {
+ b.mem.AssertSize(b.T(), 0)
+}
+
+// Float16BinaryFuncTestSuite checks that the binary arithmetic functions
+// report arrow.ErrNotImplemented for float16 inputs.
+type Float16BinaryFuncTestSuite struct {
+ BinaryFuncTestSuite
+}
+
+// assertBinopErr builds float16 arrays from the JSON in lhs and rhs, runs fn
+// on them and asserts that it fails with arrow.ErrNotImplemented.
+//
+// A JSON parsing failure aborts the test immediately instead of being
+// ignored and surfacing later as a nil dereference in Release/Data.
+func (b *Float16BinaryFuncTestSuite) assertBinopErr(fn binaryFunc, lhs, rhs string) {
+	left, _, err := array.FromJSON(b.mem, arrow.FixedWidthTypes.Float16, strings.NewReader(lhs), array.WithUseNumber())
+	b.Require().NoError(err)
+	defer left.Release()
+	right, _, err := array.FromJSON(b.mem, arrow.FixedWidthTypes.Float16, strings.NewReader(rhs), array.WithUseNumber())
+	b.Require().NoError(err)
+	defer right.Release()
+
+	_, err = fn(&compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()})
+	b.ErrorIs(err, arrow.ErrNotImplemented)
+}
+
+// TestAdd expects compute.Add to reject float16 inputs both with and
+// without overflow checking.
+func (b *Float16BinaryFuncTestSuite) TestAdd() {
+	for _, noCheck := range []bool{false, true} {
+		subtest := fmt.Sprintf("no_overflow_check=%t", noCheck)
+		b.Run(subtest, func() {
+			add := func(left, right compute.Datum) (compute.Datum, error) {
+				return compute.Add(b.ctx, compute.ArithmeticOptions{NoCheckOverflow: noCheck}, left, right)
+			}
+			b.assertBinopErr(add, `[1.5]`, `[1.5]`)
+		})
+	}
+}
+
+// TestSub expects compute.Subtract to reject float16 inputs both with and
+// without overflow checking.
+func (b *Float16BinaryFuncTestSuite) TestSub() {
+	for _, noCheck := range []bool{false, true} {
+		subtest := fmt.Sprintf("no_overflow_check=%t", noCheck)
+		b.Run(subtest, func() {
+			sub := func(left, right compute.Datum) (compute.Datum, error) {
+				return compute.Subtract(b.ctx, compute.ArithmeticOptions{NoCheckOverflow: noCheck}, left, right)
+			}
+			b.assertBinopErr(sub, `[1.5]`, `[1.5]`)
+		})
+	}
+}
+
+// BinaryArithmeticSuite runs the binary arithmetic tests for the numeric
+// type T.
+type BinaryArithmeticSuite[T exec.NumericTypes] struct {
+ BinaryFuncTestSuite
+
+ // opts is passed to every function under test
+ opts compute.ArithmeticOptions
+ // min and max are the operands the tests use to provoke an overflow
+ min, max T
+}
+
+// DataType returns the arrow data type that exec.GetDataType maps T to.
+func (BinaryArithmeticSuite[T]) DataType() arrow.DataType {
+ return exec.GetDataType[T]()
+}
+
+// SetupTest runs the base suite setup and resets the options so that every
+// test starts with NoCheckOverflow set to false.
+func (b *BinaryArithmeticSuite[T]) SetupTest() {
+ b.BinaryFuncTestSuite.SetupTest()
+ b.opts.NoCheckOverflow = false
+}
+
+// makeNullScalar returns a null scalar of the suite's data type.
+func (b *BinaryArithmeticSuite[T]) makeNullScalar() scalar.Scalar {
+ return scalar.MakeNullScalar(b.DataType())
+}
+
+// makeScalar wraps val in a scalar via scalar.MakeScalar.
+func (b *BinaryArithmeticSuite[T]) makeScalar(val T) scalar.Scalar {
+ return scalar.MakeScalar(val)
+}
+
+// assertBinopScalars applies fn to the scalars built from lhs and rhs, using
+// the suite's context and options, and asserts that the result equals the
+// scalar built from expected.
+func (b *BinaryArithmeticSuite[T]) assertBinopScalars(fn binaryArithmeticFunc, lhs, rhs T, expected T) {
+	left, right := b.makeScalar(lhs), b.makeScalar(rhs)
+	exp := b.makeScalar(expected)
+
+	actual, err := fn(b.ctx, b.opts, &compute.ScalarDatum{Value: left}, &compute.ScalarDatum{Value: right})
+	// stop here on error: the type assertion below panics on a nil Datum
+	b.Require().NoError(err)
+	sc := actual.(*compute.ScalarDatum).Value
+
+	assertScalarEquals(b.T(), exp, sc)
+}
+
+// assertBinopScalarValArr is assertBinopScalarArr with the left-hand scalar
+// given as a plain value of T.
+func (b *BinaryArithmeticSuite[T]) assertBinopScalarValArr(fn binaryArithmeticFunc, lhs T, rhs, expected string) {
+ left := b.makeScalar(lhs)
+ b.assertBinopScalarArr(fn, left, rhs, expected)
+}
+
+// assertBinopScalarArr applies fn to (scalar, array) inputs and asserts that
+// the result equals the expected array. rhs and expected are JSON
+// representations of arrays of the suite's data type.
+//
+// JSON parsing and fn failures abort the test immediately so that the
+// deferred Release calls never run on a nil value.
+func (b *BinaryArithmeticSuite[T]) assertBinopScalarArr(fn binaryArithmeticFunc, lhs scalar.Scalar, rhs, expected string) {
+	right, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(rhs))
+	b.Require().NoError(err)
+	defer right.Release()
+	exp, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(expected))
+	b.Require().NoError(err)
+	defer exp.Release()
+
+	actual, err := fn(b.ctx, b.opts, &compute.ScalarDatum{Value: lhs}, &compute.ArrayDatum{Value: right.Data()})
+	b.Require().NoError(err)
+	defer actual.Release()
+	assertDatumsEqual(b.T(), &compute.ArrayDatum{Value: exp.Data()}, actual)
+}
+
+// assertBinopArrScalarVal is assertBinopArrScalar with the right-hand scalar
+// given as a plain value of T.
+func (b *BinaryArithmeticSuite[T]) assertBinopArrScalarVal(fn binaryArithmeticFunc, lhs string, rhs T, expected string) {
+ right := b.makeScalar(rhs)
+ b.assertBinopArrScalar(fn, lhs, right, expected)
+}
+
+// assertBinopArrScalar applies fn to (array, scalar) inputs and asserts that
+// the result equals the expected array. lhs and expected are JSON
+// representations of arrays of the suite's data type.
+//
+// JSON parsing and fn failures abort the test immediately so that the
+// deferred Release calls never run on a nil value.
+func (b *BinaryArithmeticSuite[T]) assertBinopArrScalar(fn binaryArithmeticFunc, lhs string, rhs scalar.Scalar, expected string) {
+	left, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(lhs))
+	b.Require().NoError(err)
+	defer left.Release()
+	exp, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(expected))
+	b.Require().NoError(err)
+	defer exp.Release()
+
+	actual, err := fn(b.ctx, b.opts, &compute.ArrayDatum{Value: left.Data()}, &compute.ScalarDatum{Value: rhs})
+	b.Require().NoError(err)
+	defer actual.Release()
+	assertDatumsEqual(b.T(), &compute.ArrayDatum{Value: exp.Data()}, actual)
+}
+
+// assertBinop parses lhs, rhs and expected as JSON arrays of the suite's
+// data type and hands them to the package-level assertBinop, with fn bound
+// to the suite's context and options.
+//
+// A JSON parsing failure aborts the test immediately instead of leaving a
+// nil array for Release/Data to dereference.
+func (b *BinaryArithmeticSuite[T]) assertBinop(fn binaryArithmeticFunc, lhs, rhs, expected string) {
+	left, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(lhs))
+	b.Require().NoError(err)
+	defer left.Release()
+	right, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(rhs))
+	b.Require().NoError(err)
+	defer right.Release()
+	exp, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(expected))
+	b.Require().NoError(err)
+	defer exp.Release()
+
+	assertBinop(b.T(), func(left, right compute.Datum) (compute.Datum, error) {
+		return fn(b.ctx, b.opts, left, right)
+	}, left, right, exp)
+}
+
+// setOverflowCheck stores value in opts.NoCheckOverflow. Despite the name,
+// passing true therefore turns overflow checking off, and false turns it on.
+func (b *BinaryArithmeticSuite[T]) setOverflowCheck(value bool) {
+ b.opts.NoCheckOverflow = value
+}
+
+// assertBinopErr parses lhs and rhs as JSON arrays of the suite's data type
+// (numbers are decoded with array.WithUseNumber) and asserts through the
+// package-level assertBinopErr that fn fails with an arrow.ErrInvalid error
+// containing expectedMsg.
+//
+// A JSON parsing failure aborts the test immediately instead of leaving a
+// nil array for Release/Data to dereference.
+func (b *BinaryArithmeticSuite[T]) assertBinopErr(fn binaryArithmeticFunc, lhs, rhs, expectedMsg string) {
+	left, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(lhs), array.WithUseNumber())
+	b.Require().NoError(err)
+	defer left.Release()
+	right, _, err := array.FromJSON(b.mem, b.DataType(), strings.NewReader(rhs), array.WithUseNumber())
+	b.Require().NoError(err)
+	defer right.Release()
+
+	assertBinopErr(b.T(), func(left, right compute.Datum) (compute.Datum, error) {
+		return fn(b.ctx, b.opts, left, right)
+	}, left, right, expectedMsg)
+}
+
+// TestAdd exercises compute.Add for the suite's data type on array/array,
+// scalar/array and array/scalar inputs, with and without nulls, once with
+// overflow checking enabled and once with it disabled.
+func (b *BinaryArithmeticSuite[T]) TestAdd() {
+ b.Run(b.DataType().String(), func() {
+ // overflow is the value stored in NoCheckOverflow, so false means that
+ // overflow checking is enabled
+ for _, overflow := range []bool{false, true} {
+ b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() {
+ b.setOverflowCheck(overflow)
+
+ b.assertBinop(compute.Add, `[]`, `[]`, `[]`)
+ b.assertBinop(compute.Add, `[3, 2, 6]`, `[1, 0, 2]`, `[4, 2, 8]`)
+ // nulls on one side
+ b.assertBinop(compute.Add, `[null, 1, null]`, `[3, 4, 5]`, `[null, 5, null]`)
+ b.assertBinop(compute.Add, `[3, 4, 5]`, `[null, 1, null]`, `[null, 5, null]`)
+ // nulls on both sides
+ b.assertBinop(compute.Add, `[null, 1, 2]`, `[3, 4, null]`, `[null, 5, null]`)
+ // all nulls
+ b.assertBinop(compute.Add, `[null]`, `[null]`, `[null]`)
+
+ // scalar on the left
+ b.assertBinopScalarValArr(compute.Add, 3, `[1, 2]`, `[4, 5]`)
+ b.assertBinopScalarValArr(compute.Add, 3, `[null, 2]`, `[null, 5]`)
+ b.assertBinopScalarArr(compute.Add, b.makeNullScalar(), `[1, 2]`, `[null, null]`)
+ b.assertBinopScalarArr(compute.Add, b.makeNullScalar(), `[null, 2]`, `[null, null]`)
+ // scalar on the right
+ b.assertBinopArrScalarVal(compute.Add, `[1, 2]`, 3, `[4, 5]`)
+ b.assertBinopArrScalarVal(compute.Add, `[null, 2]`, 3, `[null, 5]`)
+ b.assertBinopArrScalar(compute.Add, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
+ b.assertBinopArrScalar(compute.Add, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
+
+ // with checking enabled, max + max must be reported as an overflow
+ // for the non-floating point types
+ if !arrow.IsFloating(b.DataType().ID()) && !overflow {
+ val := fmt.Sprintf("[%v]", b.max)
+ b.assertBinopErr(compute.Add, val, val, "overflow")
+ }
+ })
+ }
+ })
+}
+
+// TestSub exercises compute.Subtract for the suite's data type on
+// array/array, scalar/array and array/scalar inputs, with and without
+// nulls, once with overflow checking enabled and once with it disabled.
+func (b *BinaryArithmeticSuite[T]) TestSub() {
+ b.Run(b.DataType().String(), func() {
+ // overflow is the value stored in NoCheckOverflow, so false means that
+ // overflow checking is enabled
+ for _, overflow := range []bool{false, true} {
+ b.Run(fmt.Sprintf("no_overflow_check=%t", overflow), func() {
+ b.setOverflowCheck(overflow)
+
+ b.assertBinop(compute.Subtract, `[]`, `[]`, `[]`)
+ b.assertBinop(compute.Subtract, `[3, 2, 6]`, `[1, 0, 2]`, `[2, 2, 4]`)
+ // nulls on one side
+ b.assertBinop(compute.Subtract, `[null, 4, null]`, `[2, 1, 0]`, `[null, 3, null]`)
+ b.assertBinop(compute.Subtract, `[3, 4, 5]`, `[null, 1, null]`, `[null, 3, null]`)
+ // nulls on both sides
+ b.assertBinop(compute.Subtract, `[null, 4, 3]`, `[2, 1, null]`, `[null, 3, null]`)
+ // all nulls
+ b.assertBinop(compute.Subtract, `[null]`, `[null]`, `[null]`)
+
+ // scalar on the left
+ b.assertBinopScalarValArr(compute.Subtract, 3, `[1, 2]`, `[2, 1]`)
+ b.assertBinopScalarValArr(compute.Subtract, 3, `[null, 2]`, `[null, 1]`)
+ b.assertBinopScalarArr(compute.Subtract, b.makeNullScalar(), `[1, 2]`, `[null, null]`)
+ b.assertBinopScalarArr(compute.Subtract, b.makeNullScalar(), `[null, 2]`, `[null, null]`)
+ // scalar on the right
+ b.assertBinopArrScalarVal(compute.Subtract, `[4, 5]`, 3, `[1, 2]`)
+ b.assertBinopArrScalarVal(compute.Subtract, `[null, 5]`, 3, `[null, 2]`)
+ b.assertBinopArrScalar(compute.Subtract, `[1, 2]`, b.makeNullScalar(), `[null, null]`)
+ b.assertBinopArrScalar(compute.Subtract, `[null, 2]`, b.makeNullScalar(), `[null, null]`)
+
+ // with checking enabled, min - max must be reported as an overflow
+ // for the non-floating point types
+ if !arrow.IsFloating(b.DataType().ID()) && !overflow {
+ b.assertBinopErr(compute.Subtract, fmt.Sprintf("[%v]", b.min), fmt.Sprintf("[%v]", b.max), "overflow")
+ }
+ })
+ }
+ })
+}
+
+// TestBinaryArithmetic runs BinaryArithmeticSuite for each integer and
+// floating point type, seeded with that type's extreme values, followed by
+// the float16 suite.
+func TestBinaryArithmetic(t *testing.T) {
+ suite.Run(t, &BinaryArithmeticSuite[int8]{min: math.MinInt8, max: math.MaxInt8})
+ suite.Run(t, &BinaryArithmeticSuite[uint8]{min: 0, max: math.MaxUint8})
+ suite.Run(t, &BinaryArithmeticSuite[int16]{min: math.MinInt16, max: math.MaxInt16})
+ suite.Run(t, &BinaryArithmeticSuite[uint16]{min: 0, max: math.MaxUint16})
+ suite.Run(t, &BinaryArithmeticSuite[int32]{min: math.MinInt32, max: math.MaxInt32})
+ suite.Run(t, &BinaryArithmeticSuite[uint32]{min: 0, max: math.MaxUint32})
+ suite.Run(t, &BinaryArithmeticSuite[int64]{min: math.MinInt64, max: math.MaxInt64})
+ suite.Run(t, &BinaryArithmeticSuite[uint64]{min: 0, max: math.MaxUint64})
+ // the floating point suites use -MaxFloat as min
+ suite.Run(t, &BinaryArithmeticSuite[float32]{min: -math.MaxFloat32, max: math.MaxFloat32})
+ suite.Run(t, &BinaryArithmeticSuite[float64]{min: -math.MaxFloat64, max: math.MaxFloat64})
+ suite.Run(t, new(Float16BinaryFuncTestSuite))
+}
+
+// TestBinaryArithmeticDispatchBest verifies, for the checked and unchecked
+// variants of "add" and "sub", that DispatchBest promotes each pair of input
+// types to the expected common type and selects the same kernel as an exact
+// dispatch on that common type.
+func TestBinaryArithmeticDispatchBest(t *testing.T) {
+	for _, name := range []string{"add", "sub"} {
+		for _, suffix := range []string{"", "_unchecked"} {
+			// build the registry name in its own variable: appending to the
+			// outer loop variable accumulates suffixes across iterations and
+			// only works while the first suffix is empty
+			fnName := name + suffix
+			t.Run(fnName, func(t *testing.T) {
+
+				tests := []struct {
+					left, right arrow.DataType
+					expected    arrow.DataType
+				}{
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.Null, arrow.PrimitiveTypes.Int32},
+					{arrow.Null, arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int8, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int16, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Int64, arrow.PrimitiveTypes.Int64},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Uint8, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Uint16, arrow.PrimitiveTypes.Int32},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Uint32, arrow.PrimitiveTypes.Int64},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Uint64, arrow.PrimitiveTypes.Int64},
+					{arrow.PrimitiveTypes.Uint8, arrow.PrimitiveTypes.Uint8, arrow.PrimitiveTypes.Uint8},
+					{arrow.PrimitiveTypes.Uint8, arrow.PrimitiveTypes.Uint16, arrow.PrimitiveTypes.Uint16},
+					{arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Float32},
+					{arrow.PrimitiveTypes.Float32, arrow.PrimitiveTypes.Int64, arrow.PrimitiveTypes.Float32},
+					{arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Int32, arrow.PrimitiveTypes.Float64},
+					{&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int8, ValueType: arrow.PrimitiveTypes.Float64},
+						arrow.PrimitiveTypes.Float64, arrow.PrimitiveTypes.Float64},
+					{&arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int8, ValueType: arrow.PrimitiveTypes.Float64},
+						arrow.PrimitiveTypes.Int16, arrow.PrimitiveTypes.Float64},
+				}
+
+				// both arguments are expected to end up as the common type
+				for _, tt := range tests {
+					CheckDispatchBest(t, fnName, []arrow.DataType{tt.left, tt.right}, []arrow.DataType{tt.expected, tt.expected})
+				}
+			})
+		}
+	}
+}
+
+// seed is the fixed seed given to the random array generator in the
+// benchmarks below.
+const seed = 0x94378165
+
+// binaryOp is a binary arithmetic operation with its ArithmeticOptions
+// already chosen; it is the shape the benchmark kernels invoke.
+type binaryOp = func(ctx context.Context, left, right compute.Datum) (compute.Datum, error)
+
+// Add calls compute.Add with zero-valued ArithmeticOptions, i.e. with
+// NoCheckOverflow left false.
+func Add(ctx context.Context, left, right compute.Datum) (compute.Datum, error) {
+ var opts compute.ArithmeticOptions
+ return compute.Add(ctx, opts, left, right)
+}
+
+// Subtract calls compute.Subtract with zero-valued ArithmeticOptions, i.e.
+// with NoCheckOverflow left false.
+func Subtract(ctx context.Context, left, right compute.Datum) (compute.Datum, error) {
+ var opts compute.ArithmeticOptions
+ return compute.Subtract(ctx, opts, left, right)
+}
+
+// AddUnchecked calls compute.Add with NoCheckOverflow set to true.
+func AddUnchecked(ctx context.Context, left, right compute.Datum) (compute.Datum, error) {
+ opts := compute.ArithmeticOptions{NoCheckOverflow: true}
+ return compute.Add(ctx, opts, left, right)
+}
+
+// SubtractUnchecked calls compute.Subtract with NoCheckOverflow set to true.
+func SubtractUnchecked(ctx context.Context, left, right compute.Datum) (compute.Datum, error) {
+ opts := compute.ArithmeticOptions{NoCheckOverflow: true}
+ return compute.Subtract(ctx, opts, left, right)
+}
+
+// arrayScalarKernel benchmarks op applied to a randomly generated array on
+// the left and the scalar 6 on the right, as the "array scalar"
+// sub-benchmark.
+//
+// sz is the size of the input array in bytes; the number of elements is sz
+// divided by the byte width of dt, which must be a fixed width type.
+// nullProp is the null probability handed to the random generator. Only the
+// call to op is timed; the error check and the release of the result run
+// with the timer stopped.
+func arrayScalarKernel(b *testing.B, sz int, nullProp float64, op binaryOp, dt arrow.DataType) {
+	b.Run("array scalar", func(b *testing.B) {
+		var (
+			mem                     = memory.NewCheckedAllocator(memory.DefaultAllocator)
+			width                   = dt.(arrow.FixedWidthDataType).Bytes()
+			arraySize               = int64(sz / width)
+			min       int64         = 6
+			max                     = min + 15
+			sc, _                   = scalar.MakeScalarParam(6, dt)
+			rhs       compute.Datum = &compute.ScalarDatum{Value: sc}
+			rng                     = gen.NewRandomArrayGenerator(seed, mem)
+		)
+
+		lhs := rng.Numeric(dt.ID(), arraySize, min, max, nullProp)
+		b.Cleanup(func() {
+			lhs.Release()
+		})
+
+		var (
+			res  compute.Datum
+			err  error
+			ctx  = context.Background()
+			left = &compute.ArrayDatum{Value: lhs.Data()}
+		)
+
+		// SetBytes expects bytes per operation, not the element count,
+		// otherwise the reported MB/s is too low by a factor of width
+		b.SetBytes(arraySize * int64(width))
+		b.ResetTimer()
+		for n := 0; n < b.N; n++ {
+			res, err = op(ctx, left, rhs)
+			b.StopTimer()
+			if err != nil {
+				b.Fatal(err)
+			}
+			res.Release()
+			b.StartTimer()
+		}
+	})
+}
+
+// arrayArrayKernel benchmarks op applied to two randomly generated arrays,
+// as the "array array" sub-benchmark.
+//
+// sz is the size of each input array in bytes; the number of elements is sz
+// divided by the byte width of dt, which must be a fixed width type.
+// nullProp is the null probability handed to the random generator. The left
+// values are drawn from [8, 14] and the right values from [1, 7]. Only the
+// call to op is timed; the error check and the release of the result run
+// with the timer stopped.
+func arrayArrayKernel(b *testing.B, sz int, nullProp float64, op binaryOp, dt arrow.DataType) {
+	b.Run("array array", func(b *testing.B) {
+		var (
+			mem             = memory.NewCheckedAllocator(memory.DefaultAllocator)
+			width           = dt.(arrow.FixedWidthDataType).Bytes()
+			arraySize       = int64(sz / width)
+			rmin      int64 = 1
+			rmax            = rmin + 6 // 7
+			lmin            = rmax + 1 // 8
+			lmax            = lmin + 6 // 14
+			rng             = gen.NewRandomArrayGenerator(seed, mem)
+		)
+
+		lhs := rng.Numeric(dt.ID(), arraySize, lmin, lmax, nullProp)
+		rhs := rng.Numeric(dt.ID(), arraySize, rmin, rmax, nullProp)
+		b.Cleanup(func() {
+			lhs.Release()
+			rhs.Release()
+		})
+		var (
+			res   compute.Datum
+			err   error
+			ctx   = context.Background()
+			left  = &compute.ArrayDatum{Value: lhs.Data()}
+			right = &compute.ArrayDatum{Value: rhs.Data()}
+		)
+
+		// SetBytes expects bytes per operation, not the element count,
+		// otherwise the reported MB/s is too low by a factor of width. As
+		// before, a single input array is accounted for.
+		b.SetBytes(arraySize * int64(width))
+		b.ResetTimer()
+		for n := 0; n < b.N; n++ {
+			res, err = op(ctx, left, right)
+			b.StopTimer()
+			if err != nil {
+				b.Fatal(err)
+			}
+			res.Release()
+			b.StartTimer()
+		}
+	})
+}
+
+// BenchmarkScalarArithmetic benchmarks the checked and unchecked Add and
+// Subtract wrappers for every type in numericTypes, on array/array and
+// array/scalar inputs sized to the detected L3 cache (CpuCacheSizes[2]) with
+// null probabilities of 0, 0.5 and 1.
+//
+// Sub-benchmarks are nested as type / size and null probability / function /
+// input shape.
+func BenchmarkScalarArithmetic(b *testing.B) {
+ args := []struct {
+ sz int
+ nullProb float64
+ }{
+ {CpuCacheSizes[2], 0},
+ {CpuCacheSizes[2], 0.5},
+ {CpuCacheSizes[2], 1},
+ }
+
+ testfns := []struct {
+ name string
+ op binaryOp
+ }{
+ {"Add", Add},
+ {"AddUnchecked", AddUnchecked},
+ {"Subtract", Subtract},
+ {"SubtractUnchecked", SubtractUnchecked},
+ }
+
+ for _, dt := range numericTypes {
+ b.Run(dt.String(), func(b *testing.B) {
+ for _, benchArgs := range args {
+ b.Run(fmt.Sprintf("sz=%d/nullprob=%.2f", benchArgs.sz, benchArgs.nullProb), func(b *testing.B) {
+ for _, tfn := range testfns {
+ b.Run(tfn.name, func(b *testing.B) {
+ arrayArrayKernel(b, benchArgs.sz, benchArgs.nullProb, tfn.op, dt)
+ arrayScalarKernel(b, benchArgs.sz, benchArgs.nullProb, tfn.op, dt)
+ })
+ }
+ })
+ }
+ })
+ }
+}
diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go
index c8f07e23aef..cb5c4f8a758 100644
--- a/go/arrow/compute/cast_test.go
+++ b/go/arrow/compute/cast_test.go
@@ -34,7 +34,6 @@ import (
"github.com/apache/arrow/go/v10/arrow/internal/testing/types"
"github.com/apache/arrow/go/v10/arrow/memory"
"github.com/apache/arrow/go/v10/arrow/scalar"
- "github.com/klauspost/cpuid/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
@@ -2732,26 +2731,6 @@ func TestCasts(t *testing.T) {
const rngseed = 0x94378165
-var (
- CpuCacheSizes = [...]int{ // defaults
- 32 * 1024, // level 1: 32K
- 256 * 1024, // level 2: 256K
- 3072 * 1024, // level 3: 3M
- }
-)
-
-func init() {
- if cpuid.CPU.Cache.L1D != -1 {
- CpuCacheSizes[0] = cpuid.CPU.Cache.L1D
- }
- if cpuid.CPU.Cache.L2 != -1 {
- CpuCacheSizes[1] = cpuid.CPU.Cache.L2
- }
- if cpuid.CPU.Cache.L3 != -1 {
- CpuCacheSizes[2] = cpuid.CPU.Cache.L3
- }
-}
-
func benchmarkNumericCast(b *testing.B, fromType, toType arrow.DataType, opts compute.CastOptions, size, min, max int64, nullprob float64) {
rng := gen.NewRandomArrayGenerator(rngseed, memory.DefaultAllocator)
arr := rng.Numeric(fromType.ID(), size, min, max, nullprob)
diff --git a/go/arrow/compute/exec.go b/go/arrow/compute/exec.go
index 3709424b9e4..b7f4962806c 100644
--- a/go/arrow/compute/exec.go
+++ b/go/arrow/compute/exec.go
@@ -99,6 +99,17 @@ func execInternal(ctx context.Context, fn Function, opts FunctionOptions, passed
return
}
+ // cast arguments if necessary
+ for i, arg := range args {
+ if !arrow.TypeEqual(inTypes[i], arg.(ArrayLikeDatum).Type()) {
+ args[i], err = CastDatum(ctx, arg, SafeCastOptions(inTypes[i]))
+ if err != nil {
+ return nil, err
+ }
+ defer args[i].Release()
+ }
+ }
+
kctx := &exec.KernelCtx{Ctx: ctx, Kernel: k}
init := k.GetInitFn()
kinitArgs := exec.KernelInitArgs{Kernel: k, Inputs: inTypes, Options: opts}
diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go
index 8098f2f8edd..f51c59deaf0 100644
--- a/go/arrow/compute/executor.go
+++ b/go/arrow/compute/executor.go
@@ -242,7 +242,7 @@ func propagateNulls(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ArraySp
}
var (
- arrsWithNulls = make([]*exec.ArraySpan, 0)
+ arrsWithNulls = make([]*exec.ArraySpan, 0, len(batch.Values))
isAllNull bool
prealloc bool = out.Buffers[0].Buf != nil
)
@@ -596,6 +596,7 @@ func (s *scalarExecutor) executeSpans(data chan<- Datum) (err error) {
resultOffset = nextOffset
}
if err != nil {
+ prealloc.Release()
return
}
diff --git a/go/arrow/compute/expression.go b/go/arrow/compute/expression.go
index 644de5cf5c9..aa6e3661afa 100644
--- a/go/arrow/compute/expression.go
+++ b/go/arrow/compute/expression.go
@@ -485,7 +485,7 @@ const (
)
type ArithmeticOptions struct {
- CheckOverflow bool `compute:"check_overflow"`
+ NoCheckOverflow bool `compute:"check_overflow"`
}
func (ArithmeticOptions) TypeName() string { return "ArithmeticOptions" }
diff --git a/go/arrow/compute/functions_test.go b/go/arrow/compute/functions_test.go
index 78dbd8be5e4..1f167f0232c 100644
--- a/go/arrow/compute/functions_test.go
+++ b/go/arrow/compute/functions_test.go
@@ -19,8 +19,10 @@ package compute_test
import (
"testing"
+ "github.com/apache/arrow/go/v10/arrow"
"github.com/apache/arrow/go/v10/arrow/compute"
"github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
)
func TestArityBasics(t *testing.T) {
@@ -44,3 +46,22 @@ func TestArityBasics(t *testing.T) {
assert.Equal(t, 2, varargs.NArgs)
assert.True(t, varargs.IsVarArgs)
}
+
+// CheckDispatchBest asserts that calling DispatchBest on the function
+// registered as funcName with originalTypes yields the same kernel as
+// DispatchExact with expected, and that DispatchBest rewrote its arguments
+// into expected.
+//
+// originalTypes is copied before the call, so the caller's slice is never
+// modified.
+func CheckDispatchBest(t *testing.T, funcName string, originalTypes, expected []arrow.DataType) {
+	fn, exists := compute.GetFunctionRegistry().GetFunction(funcName)
+	require.True(t, exists)
+
+	vals := make([]arrow.DataType, len(originalTypes))
+	copy(vals, originalTypes)
+
+	actualKernel, err := fn.DispatchBest(vals...)
+	require.NoError(t, err)
+	expKernel, err := fn.DispatchExact(expected...)
+	require.NoError(t, err)
+
+	assert.Same(t, expKernel, actualKernel)
+	// require: indexing expected[i] below would panic on a length mismatch
+	require.Len(t, vals, len(expected))
+	for i, v := range vals {
+		// use an explicit format string: passing the type names directly
+		// makes testify treat the first one as the format
+		assert.Truef(t, arrow.TypeEqual(v, expected[i]), "arg %d: expected %s, got %s", i, expected[i], v)
+	}
+}
diff --git a/go/arrow/compute/internal/exec/span.go b/go/arrow/compute/internal/exec/span.go
index ca6caf436b9..1e8a719d347 100644
--- a/go/arrow/compute/internal/exec/span.go
+++ b/go/arrow/compute/internal/exec/span.go
@@ -86,6 +86,21 @@ type ArraySpan struct {
Children []ArraySpan
}
+// Release releases the buffers that this span allocated itself. If an error
+// is encountered, call it on a preallocated span to ensure that any
+// self-allocated buffers are released; it will not call Release on buffers
+// the span doesn't own (SelfAlloc != true). The children are released
+// before the span's own buffers.
+func (a *ArraySpan) Release() {
+ // c is a copy of the child span; Release is invoked on that copy
+ for _, c := range a.Children {
+ c.Release()
+ }
+
+ for _, b := range a.Buffers {
+ if b.SelfAlloc {
+ b.Owner.Release()
+ }
+ }
+}
+
func (a *ArraySpan) MayHaveNulls() bool {
return atomic.LoadInt64(&a.Nulls) != 0 && a.Buffers[0].Buf != nil
}
@@ -114,7 +129,7 @@ func (a *ArraySpan) NumBuffers() int { return getNumBuffers(a.Type) }
// MakeData generates an arrow.ArrayData object for this ArraySpan,
// properly updating the buffer ref count if necessary.
func (a *ArraySpan) MakeData() arrow.ArrayData {
- bufs := make([]*memory.Buffer, a.NumBuffers())
+ var bufs [3]*memory.Buffer
for i := range bufs {
b := a.GetBuffer(i)
bufs[i] = b
@@ -155,7 +170,7 @@ func (a *ArraySpan) MakeData() arrow.ArrayData {
}
if dt.ID() == arrow.DICTIONARY {
- result := array.NewData(a.Type, length, bufs, nil, nulls, off)
+ result := array.NewData(a.Type, length, bufs[:a.NumBuffers()], nil, nulls, off)
dict := a.Dictionary().MakeData()
defer dict.Release()
result.SetDictionary(dict)
@@ -173,7 +188,7 @@ func (a *ArraySpan) MakeData() arrow.ArrayData {
children[i] = d
}
}
- return array.NewData(a.Type, length, bufs, children, nulls, off)
+ return array.NewData(a.Type, length, bufs[:a.NumBuffers()], children, nulls, off)
}
// MakeArray is a convenience function for calling array.MakeFromData(a.MakeData())
@@ -186,14 +201,24 @@ func (a *ArraySpan) MakeArray() arrow.Array {
// SetSlice updates the offset and length of this ArraySpan to refer to
// a specific slice of the underlying buffers.
func (a *ArraySpan) SetSlice(off, length int64) {
- a.Offset, a.Len = off, length
+ if off == a.Offset && length == a.Len {
+ // no-op when the requested window equals the current one, so the null count already held stays valid
+ return
+ }
+
if a.Type.ID() != arrow.NULL {
if a.Nulls != 0 {
- a.Nulls = array.UnknownNullCount
+ if a.Nulls == a.Len { // compared against the old Len: every value was null, so the slice is all null too
+ a.Nulls = length
+ } else {
+ a.Nulls = array.UnknownNullCount // only some values were null; the count for the sub-range is not known here
+ }
}
} else {
- a.Nulls = a.Len
+ a.Nulls = length // NULL-typed span: the null count tracks the new length
}
+
+ a.Offset, a.Len = off, length // updated last so the checks above still see the previous Offset/Len
}
// GetBuffer returns the buffer for the requested index. If this buffer
diff --git a/go/arrow/compute/internal/exec/utils.go b/go/arrow/compute/internal/exec/utils.go
index 876e3f38ece..57fe3183c6e 100644
--- a/go/arrow/compute/internal/exec/utils.go
+++ b/go/arrow/compute/internal/exec/utils.go
@@ -135,6 +135,13 @@ func Min[T constraints.Ordered](a, b T) T {
return b
}
+func Max[T constraints.Ordered](a, b T) T {
+	if b < a { // a is returned only when strictly greater; ties (and NaN operands) yield b
+		return a
+	}
+	return b
+}
+
// OptionsInit should be used in the case where a KernelState is simply
// represented with a specific type by value (instead of pointer).
// This will initialize the KernelState as a value-copied instance of
@@ -165,13 +172,26 @@ var typMap = map[reflect.Type]arrow.DataType{
reflect.TypeOf(arrow.Date32(0)): arrow.FixedWidthTypes.Date32,
reflect.TypeOf(arrow.Date64(0)): arrow.FixedWidthTypes.Date64,
reflect.TypeOf(true): arrow.FixedWidthTypes.Boolean,
+ reflect.TypeOf(float16.Num{}): arrow.FixedWidthTypes.Float16,
}
-func GetDataType[T NumericTypes | bool | string]() arrow.DataType {
+// GetDataType returns the appropriate arrow.DataType for the given type T
+// only for non-parametric types. This uses a map and reflection internally
+// so don't call this in a tight loop, instead call this once and then use
+// a closure with the result.
+func GetDataType[T NumericTypes | bool | string | float16.Num]() arrow.DataType {
var z T
return typMap[reflect.TypeOf(z)]
}
+// GetType returns the arrow.Type ID that typMap associates with T; it is only
+// meaningful for non-parametric types. It goes through a map lookup and reflection,
+// so call it once and capture the result in a closure rather than inside a tight loop.
+func GetType[T NumericTypes | bool | string | float16.Num]() arrow.Type {
+	var zero T
+	return typMap[reflect.TypeOf(zero)].ID()
+}
+
type arrayBuilder[T NumericTypes] interface {
array.Builder
Append(T)
diff --git a/go/arrow/compute/internal/kernels/Makefile b/go/arrow/compute/internal/kernels/Makefile
index 752c38d412d..96238cc9a12 100644
--- a/go/arrow/compute/internal/kernels/Makefile
+++ b/go/arrow/compute/internal/kernels/Makefile
@@ -36,7 +36,8 @@ ALL_SOURCES := $(shell find . -path ./_lib -prune -o -name '*.go' -name '*.s' -n
.PHONEY: assembly
INTEL_SOURCES := \
- cast_numeric_avx2_amd64.s cast_numeric_sse4_amd64.s constant_factor_avx2_amd64.s constant_factor_sse4_amd64.s
+ cast_numeric_avx2_amd64.s cast_numeric_sse4_amd64.s constant_factor_avx2_amd64.s \
+ constant_factor_sse4_amd64.s base_arithmetic_avx2_amd64.s base_arithmetic_sse4_amd64.s
#
# ARROW-15336: DO NOT add the assembly target for Arm64 (ARM_SOURCES) until c2goasm added the Arm64 support.
@@ -55,6 +56,15 @@ _lib/cast_numeric_sse4_amd64.s: _lib/cast_numeric.cc
_lib/cast_numeric_neon.s: _lib/cast_numeric.cc
$(CXX) -std=c++17 -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+_lib/base_arithmetic_avx2_amd64.s: _lib/base_arithmetic.cc
+ $(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/base_arithmetic_sse4_amd64.s: _lib/base_arithmetic.cc
+ $(CXX) -std=c++17 -S $(C_FLAGS) $(ASM_FLAGS_SSE4) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
+_lib/base_arithmetic_neon.s: _lib/base_arithmetic.cc
+ $(CXX) -std=c++17 -S $(C_FLAGS_NEON) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
+
_lib/constant_factor_avx2_amd64.s: _lib/constant_factor.c
$(CC) -S $(C_FLAGS) $(ASM_FLAGS_AVX2) $^ -o $@ ; $(PERL_FIXUP_ROTATE) $@
@@ -76,6 +86,12 @@ constant_factor_avx2_amd64.s: _lib/constant_factor_avx2_amd64.s
constant_factor_sse4_amd64.s: _lib/constant_factor_sse4_amd64.s
$(C2GOASM) -a -f $^ $@
+base_arithmetic_avx2_amd64.s: _lib/base_arithmetic_avx2_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
+base_arithmetic_sse4_amd64.s: _lib/base_arithmetic_sse4_amd64.s
+ $(C2GOASM) -a -f $^ $@
+
clean:
rm -f $(INTEL_SOURCES)
rm -f $(addprefix _lib/,$(INTEL_SOURCES))
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
new file mode 100644
index 00000000000..dc2234bfb35
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic.cc
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include "types.h"
+#include "vendored/safe-math.h"
+
+// Corresponds to equivalent ArithmeticOp enum in base_arithmetic.go
+// for passing across which operation to perform. This allows simpler
+// implementation at the cost of having to pass the extra int8 and
+// perform a switch.
+//
+// In cases of small arrays, this is completely negligible. In cases
+// of large arrays, the time saved by using SIMD here is significantly
+// worth the cost.
+enum class optype : int8_t {
+ ADD,
+ SUB,
+
+ // this impl doesn't actually perform any overflow checks as we need
+ // to only run overflow checks on non-null entries
+ ADD_CHECKED,
+ SUB_CHECKED,
+};
+
+struct Add {
+ template
+ static constexpr T Call(Arg0 left, Arg1 right) {
+ if constexpr (is_arithmetic_v)
+ return left + right;
+ }
+};
+
+struct Sub {
+ template
+ static constexpr T Call(Arg0 left, Arg1 right) {
+ if constexpr (is_arithmetic_v)
+ return left - right;
+ }
+};
+
+struct AddChecked {
+ template
+ static constexpr T Call(Arg0 left, Arg1 right) {
+ static_assert(is_same::value && is_same::value, "");
+ if constexpr(is_arithmetic_v) {
+ return left + right;
+ }
+ }
+};
+
+
+struct SubChecked {
+ template
+ static constexpr T Call(Arg0 left, Arg1 right) {
+ static_assert(is_same::value && is_same::value, "");
+ if constexpr(is_arithmetic_v) {
+ return left - right;
+ }
+ }
+};
+
+template
+struct arithmetic_op_arr_arr_impl {
+ static inline void exec(const void* in_left, const void* in_right, void* out, const int len) {
+ const T* left = reinterpret_cast(in_left);
+ const T* right = reinterpret_cast(in_right);
+ T* output = reinterpret_cast(out);
+
+ for (int i = 0; i < len; ++i) {
+ output[i] = Op::template Call(left[i], right[i]);
+ }
+ }
+};
+
+template
+struct arithmetic_op_arr_scalar_impl {
+ static inline void exec(const void* in_left, const void* scalar_right, void* out, const int len) {
+ const T* left = reinterpret_cast(in_left);
+ const T right = *reinterpret_cast(scalar_right);
+ T* output = reinterpret_cast(out);
+
+ for (int i = 0; i < len; ++i) {
+ output[i] = Op::template Call(left[i], right);
+ }
+ }
+};
+
+template
+struct arithmetic_op_scalar_arr_impl {
+ static inline void exec(const void* scalar_left, const void* in_right, void* out, const int len) {
+ const T left = *reinterpret_cast(scalar_left);
+ const T* right = reinterpret_cast(in_right);
+ T* output = reinterpret_cast(out);
+
+ for (int i = 0; i < len; ++i) {
+ output[i] = Op::template Call(left, right[i]);
+ }
+ }
+};
+
+
+template typename Impl>
+static inline void arithmetic_op(const int type, const void* in_left, const void* in_right, void* output, const int len) {
+ const auto intype = static_cast(type);
+
+ switch (intype) {
+ case arrtype::UINT8:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::INT8:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::UINT16:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::INT16:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::UINT32:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::INT32:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::UINT64:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::INT64:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::FLOAT32:
+ return Impl::exec(in_left, in_right, output, len);
+ case arrtype::FLOAT64:
+ return Impl::exec(in_left, in_right, output, len);
+ default:
+ break;
+ }
+}
+
+template class Impl>
+static inline void arithmetic_impl(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
+ const auto opt = static_cast(op);
+
+ switch (opt) {
+ case optype::ADD:
+ return arithmetic_op(type, in_left, in_right, out, len);
+ case optype::ADD_CHECKED:
+ return arithmetic_op(type, in_left, in_right, out, len);
+ case optype::SUB:
+ return arithmetic_op(type, in_left, in_right, out, len);
+ case optype::SUB_CHECKED:
+ return arithmetic_op(type, in_left, in_right, out, len);
+ default:
+ break;
+ }
+}
+
+extern "C" void FULL_NAME(arithmetic)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
+ arithmetic_impl(type, op, in_left, in_right, out, len);
+}
+
+extern "C" void FULL_NAME(arithmetic_arr_scalar)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
+ arithmetic_impl(type, op, in_left, in_right, out, len);
+}
+
+extern "C" void FULL_NAME(arithmetic_scalar_arr)(const int type, const int8_t op, const void* in_left, const void* in_right, void* out, const int len) {
+ arithmetic_impl(type, op, in_left, in_right, out, len);
+}
\ No newline at end of file
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
new file mode 100644
index 00000000000..76355712b8a
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_avx2_amd64.s
@@ -0,0 +1,12469 @@
+ .text
+ .intel_syntax noprefix
+ .file "base_arithmetic.cc"
+ .globl arithmetic_avx2 # -- Begin function arithmetic_avx2
+ .p2align 4, 0x90
+ .type arithmetic_avx2,@function
+arithmetic_avx2: # @arithmetic_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 1
+ jg .LBB0_10
+# %bb.1:
+ test sil, sil
+ je .LBB0_19
+# %bb.2:
+ cmp sil, 1
+ jne .LBB0_537
+# %bb.3:
+ cmp edi, 6
+ jg .LBB0_291
+# %bb.4:
+ cmp edi, 3
+ jle .LBB0_5
+# %bb.285:
+ cmp edi, 4
+ je .LBB0_324
+# %bb.286:
+ cmp edi, 5
+ je .LBB0_336
+# %bb.287:
+ cmp edi, 6
+ jne .LBB0_537
+# %bb.288:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.289:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_348
+# %bb.290:
+ xor esi, esi
+ jmp .LBB0_353
+.LBB0_10:
+ cmp sil, 2
+ je .LBB0_152
+# %bb.11:
+ cmp sil, 3
+ jne .LBB0_537
+# %bb.12:
+ cmp edi, 6
+ jg .LBB0_417
+# %bb.13:
+ cmp edi, 3
+ jle .LBB0_14
+# %bb.411:
+ cmp edi, 4
+ je .LBB0_450
+# %bb.412:
+ cmp edi, 5
+ je .LBB0_462
+# %bb.413:
+ cmp edi, 6
+ jne .LBB0_537
+# %bb.414:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.415:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_474
+# %bb.416:
+ xor esi, esi
+ jmp .LBB0_479
+.LBB0_19:
+ cmp edi, 6
+ jg .LBB0_32
+# %bb.20:
+ cmp edi, 3
+ jle .LBB0_21
+# %bb.26:
+ cmp edi, 4
+ je .LBB0_65
+# %bb.27:
+ cmp edi, 5
+ je .LBB0_77
+# %bb.28:
+ cmp edi, 6
+ jne .LBB0_537
+# %bb.29:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.30:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_89
+# %bb.31:
+ xor esi, esi
+ jmp .LBB0_94
+.LBB0_152:
+ cmp edi, 6
+ jg .LBB0_165
+# %bb.153:
+ cmp edi, 3
+ jle .LBB0_154
+# %bb.159:
+ cmp edi, 4
+ je .LBB0_198
+# %bb.160:
+ cmp edi, 5
+ je .LBB0_210
+# %bb.161:
+ cmp edi, 6
+ jne .LBB0_537
+# %bb.162:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.163:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_222
+# %bb.164:
+ xor esi, esi
+ jmp .LBB0_227
+.LBB0_291:
+ cmp edi, 8
+ jle .LBB0_292
+# %bb.297:
+ cmp edi, 9
+ je .LBB0_378
+# %bb.298:
+ cmp edi, 11
+ je .LBB0_390
+# %bb.299:
+ cmp edi, 12
+ jne .LBB0_537
+# %bb.300:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.301:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_402
+# %bb.302:
+ xor esi, esi
+ jmp .LBB0_407
+.LBB0_417:
+ cmp edi, 8
+ jle .LBB0_418
+# %bb.423:
+ cmp edi, 9
+ je .LBB0_504
+# %bb.424:
+ cmp edi, 11
+ je .LBB0_516
+# %bb.425:
+ cmp edi, 12
+ jne .LBB0_537
+# %bb.426:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.427:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_528
+# %bb.428:
+ xor esi, esi
+ jmp .LBB0_533
+.LBB0_32:
+ cmp edi, 8
+ jle .LBB0_33
+# %bb.38:
+ cmp edi, 9
+ je .LBB0_119
+# %bb.39:
+ cmp edi, 11
+ je .LBB0_131
+# %bb.40:
+ cmp edi, 12
+ jne .LBB0_537
+# %bb.41:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.42:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_143
+# %bb.43:
+ xor esi, esi
+ jmp .LBB0_148
+.LBB0_165:
+ cmp edi, 8
+ jle .LBB0_166
+# %bb.171:
+ cmp edi, 9
+ je .LBB0_252
+# %bb.172:
+ cmp edi, 11
+ je .LBB0_264
+# %bb.173:
+ cmp edi, 12
+ jne .LBB0_537
+# %bb.174:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.175:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_276
+# %bb.176:
+ xor esi, esi
+ jmp .LBB0_281
+.LBB0_5:
+ cmp edi, 2
+ je .LBB0_303
+# %bb.6:
+ cmp edi, 3
+ jne .LBB0_537
+# %bb.7:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.8:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_315
+# %bb.9:
+ xor esi, esi
+ jmp .LBB0_320
+.LBB0_14:
+ cmp edi, 2
+ je .LBB0_429
+# %bb.15:
+ cmp edi, 3
+ jne .LBB0_537
+# %bb.16:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.17:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_441
+# %bb.18:
+ xor esi, esi
+ jmp .LBB0_446
+.LBB0_21:
+ cmp edi, 2
+ je .LBB0_44
+# %bb.22:
+ cmp edi, 3
+ jne .LBB0_537
+# %bb.23:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.24:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_56
+# %bb.25:
+ xor esi, esi
+ jmp .LBB0_61
+.LBB0_154:
+ cmp edi, 2
+ je .LBB0_177
+# %bb.155:
+ cmp edi, 3
+ jne .LBB0_537
+# %bb.156:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.157:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_189
+# %bb.158:
+ xor esi, esi
+ jmp .LBB0_194
+.LBB0_292:
+ cmp edi, 7
+ je .LBB0_357
+# %bb.293:
+ cmp edi, 8
+ jne .LBB0_537
+# %bb.294:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.295:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_369
+# %bb.296:
+ xor esi, esi
+ jmp .LBB0_374
+.LBB0_418:
+ cmp edi, 7
+ je .LBB0_483
+# %bb.419:
+ cmp edi, 8
+ jne .LBB0_537
+# %bb.420:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.421:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_495
+# %bb.422:
+ xor esi, esi
+ jmp .LBB0_500
+.LBB0_33:
+ cmp edi, 7
+ je .LBB0_98
+# %bb.34:
+ cmp edi, 8
+ jne .LBB0_537
+# %bb.35:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.36:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_110
+# %bb.37:
+ xor esi, esi
+ jmp .LBB0_115
+.LBB0_166:
+ cmp edi, 7
+ je .LBB0_231
+# %bb.167:
+ cmp edi, 8
+ jne .LBB0_537
+# %bb.168:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.169:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_243
+# %bb.170:
+ xor esi, esi
+ jmp .LBB0_248
+.LBB0_324:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.325:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_327
+# %bb.326:
+ xor esi, esi
+ jmp .LBB0_332
+.LBB0_336:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.337:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_339
+# %bb.338:
+ xor esi, esi
+ jmp .LBB0_344
+.LBB0_450:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.451:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_453
+# %bb.452:
+ xor esi, esi
+ jmp .LBB0_458
+.LBB0_462:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.463:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_465
+# %bb.464:
+ xor esi, esi
+ jmp .LBB0_470
+.LBB0_65:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.66:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_68
+# %bb.67:
+ xor esi, esi
+ jmp .LBB0_73
+.LBB0_77:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.78:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_80
+# %bb.79:
+ xor esi, esi
+ jmp .LBB0_85
+.LBB0_198:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.199:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_201
+# %bb.200:
+ xor esi, esi
+ jmp .LBB0_206
+.LBB0_210:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.211:
+ mov r10d, r9d
+ cmp r9d, 64
+ jae .LBB0_213
+# %bb.212:
+ xor esi, esi
+ jmp .LBB0_218
+.LBB0_378:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.379:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_381
+# %bb.380:
+ xor esi, esi
+ jmp .LBB0_386
+.LBB0_390:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.391:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_393
+# %bb.392:
+ xor esi, esi
+ jmp .LBB0_398
+.LBB0_504:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.505:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_507
+# %bb.506:
+ xor esi, esi
+ jmp .LBB0_512
+.LBB0_516:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.517:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_519
+# %bb.518:
+ xor esi, esi
+ jmp .LBB0_524
+.LBB0_119:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.120:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_122
+# %bb.121:
+ xor esi, esi
+ jmp .LBB0_127
+.LBB0_131:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.132:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_134
+# %bb.133:
+ xor esi, esi
+ jmp .LBB0_139
+.LBB0_252:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.253:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_255
+# %bb.254:
+ xor esi, esi
+ jmp .LBB0_260
+.LBB0_264:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.265:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_267
+# %bb.266:
+ xor esi, esi
+ jmp .LBB0_272
+.LBB0_303:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.304:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_306
+# %bb.305:
+ xor esi, esi
+ jmp .LBB0_311
+.LBB0_429:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.430:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_432
+# %bb.431:
+ xor esi, esi
+ jmp .LBB0_437
+.LBB0_44:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.45:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_47
+# %bb.46:
+ xor esi, esi
+ jmp .LBB0_52
+.LBB0_177:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.178:
+ mov r10d, r9d
+ cmp r9d, 128
+ jae .LBB0_180
+# %bb.179:
+ xor esi, esi
+ jmp .LBB0_185
+.LBB0_357:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.358:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_360
+# %bb.359:
+ xor esi, esi
+ jmp .LBB0_365
+.LBB0_483:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.484:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_486
+# %bb.485:
+ xor esi, esi
+ jmp .LBB0_491
+.LBB0_98:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.99:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_101
+# %bb.100:
+ xor esi, esi
+ jmp .LBB0_106
+.LBB0_231:
+ test r9d, r9d
+ jle .LBB0_537
+# %bb.232:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_234
+# %bb.233:
+ xor esi, esi
+ jmp .LBB0_239
+.LBB0_348:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_353
+# %bb.349:
+ and al, dil
+ jne .LBB0_353
+# %bb.350:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_351: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_351
+# %bb.352:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_353:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_355
+.LBB0_354: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_354
+.LBB0_355:
+ cmp r9, 3
+ jb .LBB0_537
+.LBB0_356: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_356
+ jmp .LBB0_537
+.LBB0_474:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_479
+# %bb.475:
+ and al, dil
+ jne .LBB0_479
+# %bb.476:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_477: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_477
+# %bb.478:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_479:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_481
+.LBB0_480: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi]
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_480
+.LBB0_481:
+ cmp r9, 3
+ jb .LBB0_537
+.LBB0_482: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_482
+ jmp .LBB0_537
+.LBB0_89:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_94
+# %bb.90:
+ and al, dil
+ jne .LBB0_94
+# %bb.91:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_92: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_92
+# %bb.93:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_94:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_96
+.LBB0_95: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_95
+.LBB0_96:
+ cmp r9, 3
+ jb .LBB0_537
+.LBB0_97: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_97
+ jmp .LBB0_537
+.LBB0_222:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_227
+# %bb.223:
+ and al, dil
+ jne .LBB0_227
+# %bb.224:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_225: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_225
+# %bb.226:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_227:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_229
+.LBB0_228: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_228
+.LBB0_229:
+ cmp r9, 3
+ jb .LBB0_537
+.LBB0_230: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_230
+ jmp .LBB0_537
+.LBB0_402:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_407
+# %bb.403:
+ and al, dil
+ jne .LBB0_407
+# %bb.404:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_405: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vsubpd ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_405
+# %bb.406:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_407:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_409
+.LBB0_408: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_408
+.LBB0_409:
+ cmp rdi, 3
+ jb .LBB0_537
+.LBB0_410: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_410
+ jmp .LBB0_537
+.LBB0_528:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_533
+# %bb.529:
+ and al, dil
+ jne .LBB0_533
+# %bb.530:
+ mov esi, r10d
+ and esi, -16
+ xor edi, edi
+.LBB0_531: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vsubpd ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_531
+# %bb.532:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_533:
+ mov rdi, rsi
+ not rdi
+ add rdi, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_535
+.LBB0_534: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_534
+.LBB0_535:
+ cmp rdi, 3
+ jb .LBB0_537
+.LBB0_536: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vsubsd xmm0, xmm0, qword ptr [rcx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_536
+ jmp .LBB0_537
+.LBB0_143: # Element-wise float64 add: [r8 + 8*i] = [rcx + 8*i] + [rdx + 8*i] for i in [0, r10).
+ lea rsi, [r8 + 8*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 8*r10 is above r8 (unsigned)
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 8*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 8*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 8*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_148 # r8 and rdx ranges overlap: scalar loops only
+# %bb.144:
+ and al, dil
+ jne .LBB0_148 # r8 and rcx ranges overlap: scalar loops only
+# %bb.145:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16
+ xor edi, edi # rdi = vector loop index
+.LBB0_146: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vaddpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16 # 4 ymm registers x 4 doubles = 16 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_146
+# %bb.147:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_148: # Scalar tail for indices rsi .. r10-1.
+ mov rdi, rsi
+ not rdi
+ add rdi, r10 # rdi = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_150
+.LBB0_149: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_149
+.LBB0_150:
+ cmp rdi, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_148; the loop above covered them
+.LBB0_151: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_151
+ jmp .LBB0_537
+.LBB0_276: # Element-wise float64 add: [r8 + 8*i] = [rcx + 8*i] + [rdx + 8*i] for i in [0, r10).
+ lea rsi, [r8 + 8*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 8*r10 is above r8 (unsigned)
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 8*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 8*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 8*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_281 # r8 and rdx ranges overlap: scalar loops only
+# %bb.277:
+ and al, dil
+ jne .LBB0_281 # r8 and rcx ranges overlap: scalar loops only
+# %bb.278:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16
+ xor edi, edi # rdi = vector loop index
+.LBB0_279: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovupd ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovupd ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovupd ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vaddpd ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm0
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16 # 4 ymm registers x 4 doubles = 16 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_279
+# %bb.280:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_281: # Scalar tail for indices rsi .. r10-1.
+ mov rdi, rsi
+ not rdi
+ add rdi, r10 # rdi = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_283
+.LBB0_282: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_282
+.LBB0_283:
+ cmp rdi, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_281; the loop above covered them
+.LBB0_284: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi]
+ vmovsd qword ptr [r8 + 8*rsi], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 8]
+ vmovsd qword ptr [r8 + 8*rsi + 8], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 16]
+ vmovsd qword ptr [r8 + 8*rsi + 16], xmm0
+ vmovsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ vaddsd xmm0, xmm0, qword ptr [rdx + 8*rsi + 24]
+ vmovsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_284
+ jmp .LBB0_537
+.LBB0_315: # Element-wise 8-bit subtract: [r8 + i] = [rdx + i] - [rcx + i] for i in [0, r10).
+ lea rsi, [r8 + r10] # rsi = end address of the r8 range
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + r10 is above r8 (unsigned)
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_320 # r8 and rdx ranges overlap: scalar loops only
+# %bb.316:
+ and al, dil
+ jne .LBB0_320 # r8 and rcx ranges overlap: scalar loops only
+# %bb.317:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128
+ xor edi, edi # rdi = vector loop index
+.LBB0_318: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128: 4 ymm registers x 32 bytes per iteration
+ cmp rsi, rdi
+ jne .LBB0_318
+# %bb.319:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_320: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_322
+.LBB0_321: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_321
+.LBB0_322:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_320; the loop above covered them
+.LBB0_323: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_323
+ jmp .LBB0_537
+.LBB0_441: # Element-wise 8-bit subtract: [r8 + i] = [rdx + i] - [rcx + i] for i in [0, r10).
+ lea rsi, [r8 + r10] # rsi = end address of the r8 range
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + r10 is above r8 (unsigned)
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_446 # r8 and rdx ranges overlap: scalar loops only
+# %bb.442:
+ and al, dil
+ jne .LBB0_446 # r8 and rcx ranges overlap: scalar loops only
+# %bb.443:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128
+ xor edi, edi # rdi = vector loop index
+.LBB0_444: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128: 4 ymm registers x 32 bytes per iteration
+ cmp rsi, rdi
+ jne .LBB0_444
+# %bb.445:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_446: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_448
+.LBB0_447: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_447
+.LBB0_448:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_446; the loop above covered them
+.LBB0_449: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_449
+ jmp .LBB0_537
+.LBB0_56: # Element-wise 8-bit add: [r8 + i] = [rcx + i] + [rdx + i] for i in [0, r10).
+ lea rsi, [r8 + r10] # rsi = end address of the r8 range
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + r10 is above r8 (unsigned)
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_61 # r8 and rdx ranges overlap: scalar loops only
+# %bb.57:
+ and al, dil
+ jne .LBB0_61 # r8 and rcx ranges overlap: scalar loops only
+# %bb.58:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128
+ xor edi, edi # rdi = vector loop index
+.LBB0_59: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128: 4 ymm registers x 32 bytes per iteration
+ cmp rsi, rdi
+ jne .LBB0_59
+# %bb.60:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_61: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_63
+.LBB0_62: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_62
+.LBB0_63:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_61; the loop above covered them
+.LBB0_64: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_64
+ jmp .LBB0_537
+.LBB0_189: # Element-wise 8-bit add: [r8 + i] = [rcx + i] + [rdx + i] for i in [0, r10).
+ lea rsi, [r8 + r10] # rsi = end address of the r8 range
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + r10 is above r8 (unsigned)
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_194 # r8 and rdx ranges overlap: scalar loops only
+# %bb.190:
+ and al, dil
+ jne .LBB0_194 # r8 and rcx ranges overlap: scalar loops only
+# %bb.191:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128
+ xor edi, edi # rdi = vector loop index
+.LBB0_192: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128: 4 ymm registers x 32 bytes per iteration
+ cmp rsi, rdi
+ jne .LBB0_192
+# %bb.193:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_194: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_196
+.LBB0_195: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_195
+.LBB0_196:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_194; the loop above covered them
+.LBB0_197: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_197
+ jmp .LBB0_537
+.LBB0_369: # Element-wise 64-bit integer subtract: [r8 + 8*i] = [rdx + 8*i] - [rcx + 8*i] for i in [0, r10).
+ lea rsi, [r8 + 8*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 8*r10 is above r8 (unsigned)
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 8*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 8*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 8*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_374 # r8 and rdx ranges overlap: scalar loops only
+# %bb.370:
+ and al, dil
+ jne .LBB0_374 # r8 and rcx ranges overlap: scalar loops only
+# %bb.371:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16
+ xor edi, edi # rdi = vector loop index
+.LBB0_372: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16 # 4 ymm registers x 4 qwords = 16 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_372
+# %bb.373:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_374: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_376
+.LBB0_375: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_375
+.LBB0_376:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_374; the loop above covered them
+.LBB0_377: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_377
+ jmp .LBB0_537
+.LBB0_495: # Element-wise 64-bit integer subtract: [r8 + 8*i] = [rdx + 8*i] - [rcx + 8*i] for i in [0, r10).
+ lea rsi, [r8 + 8*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 8*r10 is above r8 (unsigned)
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 8*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 8*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 8*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_500 # r8 and rdx ranges overlap: scalar loops only
+# %bb.496:
+ and al, dil
+ jne .LBB0_500 # r8 and rcx ranges overlap: scalar loops only
+# %bb.497:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16
+ xor edi, edi # rdi = vector loop index
+.LBB0_498: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16 # 4 ymm registers x 4 qwords = 16 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_498
+# %bb.499:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_500: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_502
+.LBB0_501: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi]
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_501
+.LBB0_502:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_500; the loop above covered them
+.LBB0_503: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_503
+ jmp .LBB0_537
+.LBB0_110: # Element-wise 64-bit integer add: [r8 + 8*i] = [rcx + 8*i] + [rdx + 8*i] for i in [0, r10).
+ lea rsi, [r8 + 8*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 8*r10 is above r8 (unsigned)
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 8*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 8*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 8*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_115 # r8 and rdx ranges overlap: scalar loops only
+# %bb.111:
+ and al, dil
+ jne .LBB0_115 # r8 and rcx ranges overlap: scalar loops only
+# %bb.112:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16
+ xor edi, edi # rdi = vector loop index
+.LBB0_113: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16 # 4 ymm registers x 4 qwords = 16 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_113
+# %bb.114:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_115: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_117
+.LBB0_116: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_116
+.LBB0_117:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_115; the loop above covered them
+.LBB0_118: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_118
+ jmp .LBB0_537
+.LBB0_243: # Element-wise 64-bit integer add: [r8 + 8*i] = [rcx + 8*i] + [rdx + 8*i] for i in [0, r10).
+ lea rsi, [r8 + 8*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 8*r10 is above r8 (unsigned)
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 8*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 8*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 8*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_248 # r8 and rdx ranges overlap: scalar loops only
+# %bb.244:
+ and al, dil
+ jne .LBB0_248 # r8 and rcx ranges overlap: scalar loops only
+# %bb.245:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16
+ xor edi, edi # rdi = vector loop index
+.LBB0_246: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16 # 4 ymm registers x 4 qwords = 16 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_246
+# %bb.247:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_248: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_250
+.LBB0_249: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi]
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_249
+.LBB0_250:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_248; the loop above covered them
+.LBB0_251: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_251
+ jmp .LBB0_537
+.LBB0_327: # Element-wise 16-bit subtract: [r8 + 2*i] = [rdx + 2*i] - [rcx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_332 # r8 and rdx ranges overlap: scalar loops only
+# %bb.328:
+ and al, dil
+ jne .LBB0_332 # r8 and rcx ranges overlap: scalar loops only
+# %bb.329:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_330: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_330
+# %bb.331:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_332: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_334
+.LBB0_333: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_333
+.LBB0_334:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_332; the loop above covered them
+.LBB0_335: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_335
+ jmp .LBB0_537
+.LBB0_339: # Element-wise 16-bit subtract: [r8 + 2*i] = [rdx + 2*i] - [rcx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_344 # r8 and rdx ranges overlap: scalar loops only
+# %bb.340:
+ and al, dil
+ jne .LBB0_344 # r8 and rcx ranges overlap: scalar loops only
+# %bb.341:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_342: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_342
+# %bb.343:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_344: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_346
+.LBB0_345: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_345
+.LBB0_346:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_344; the loop above covered them
+.LBB0_347: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_347
+ jmp .LBB0_537
+.LBB0_453: # Element-wise 16-bit subtract: [r8 + 2*i] = [rdx + 2*i] - [rcx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_458 # r8 and rdx ranges overlap: scalar loops only
+# %bb.454:
+ and al, dil
+ jne .LBB0_458 # r8 and rcx ranges overlap: scalar loops only
+# %bb.455:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_456: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_456
+# %bb.457:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_458: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_460
+.LBB0_459: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_459
+.LBB0_460:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_458; the loop above covered them
+.LBB0_461: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_461
+ jmp .LBB0_537
+.LBB0_465: # Element-wise 16-bit subtract: [r8 + 2*i] = [rdx + 2*i] - [rcx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_470 # r8 and rdx ranges overlap: scalar loops only
+# %bb.466:
+ and al, dil
+ jne .LBB0_470 # r8 and rcx ranges overlap: scalar loops only
+# %bb.467:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_468: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm1, ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vpsubw ymm2, ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vpsubw ymm3, ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_468
+# %bb.469:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_470: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_472
+.LBB0_471: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_471
+.LBB0_472:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_470; the loop above covered them
+.LBB0_473: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_473
+ jmp .LBB0_537
+.LBB0_68: # Element-wise 16-bit add: [r8 + 2*i] = [rcx + 2*i] + [rdx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_73 # r8 and rdx ranges overlap: scalar loops only
+# %bb.69:
+ and al, dil
+ jne .LBB0_73 # r8 and rcx ranges overlap: scalar loops only
+# %bb.70:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_71: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_71
+# %bb.72:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_73: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_75
+.LBB0_74: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_74
+.LBB0_75:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_73; the loop above covered them
+.LBB0_76: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_76
+ jmp .LBB0_537
+.LBB0_80: # Element-wise 16-bit add: [r8 + 2*i] = [rcx + 2*i] + [rdx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_85 # r8 and rdx ranges overlap: scalar loops only
+# %bb.81:
+ and al, dil
+ jne .LBB0_85 # r8 and rcx ranges overlap: scalar loops only
+# %bb.82:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_83: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_83
+# %bb.84:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_85: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_87
+.LBB0_86: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_86
+.LBB0_87:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_85; the loop above covered them
+.LBB0_88: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_88
+ jmp .LBB0_537
+.LBB0_201: # Element-wise 16-bit add: [r8 + 2*i] = [rcx + 2*i] + [rdx + 2*i] for i in [0, r10).
+ lea rsi, [r8 + 2*r10] # rsi = end address of the r8 range
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b # r9b = 1 if rdx + 2*r10 is above r8 (unsigned)
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if r8 + 2*r10 is above rdx
+ cmp rax, r8
+ seta al # al = 1 if rcx + 2*r10 is above r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if r8 + 2*r10 is above rcx
+ xor esi, esi # rsi = 0: start index when the vector loop is skipped
+ test r9b, r11b
+ jne .LBB0_206 # r8 and rdx ranges overlap: scalar loops only
+# %bb.202:
+ and al, dil
+ jne .LBB0_206 # r8 and rcx ranges overlap: scalar loops only
+# %bb.203:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64
+ xor edi, edi # rdi = vector loop index
+.LBB0_204: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64 # 4 ymm registers x 16 words = 64 elements per iteration
+ cmp rsi, rdi
+ jne .LBB0_204
+# %bb.205:
+ cmp rsi, r10
+ je .LBB0_537 # no leftover elements; .LBB0_537 lies outside this excerpt
+.LBB0_206: # Scalar tail for indices rsi .. r10-1.
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4 = iterations of the one-element loop
+ je .LBB0_208
+.LBB0_207: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_207
+.LBB0_208:
+ cmp r9, 3
+ jb .LBB0_537 # at most 3 elements were left at .LBB0_206; the loop above covered them
+.LBB0_209: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4 # body is unrolled four times
+ cmp r10, rsi
+ jne .LBB0_209
+ jmp .LBB0_537
+.LBB0_213: # 16-bit integer add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 2*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 2*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 2*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_218 # r8 overlaps the rdx range: skip the vector loop
+# %bb.214:
+ and al, dil
+ jne .LBB0_218 # r8 overlaps the rcx range: skip the vector loop
+# %bb.215:
+ mov esi, r10d
+ and esi, -64 # rsi = low 32 bits of r10 rounded down to a multiple of 64 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_216: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 2*rdi] # load 64 words (4 x 32 bytes) from rcx
+ vmovdqu ymm1, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 2*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 2*rdi + 96]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi] # add the matching 64 words from rdx
+ vpaddw ymm1, ymm1, ymmword ptr [rdx + 2*rdi + 32]
+ vpaddw ymm2, ymm2, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm3, ymm3, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm0 # store the 64 results to r8
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm3
+ add rdi, 64
+ cmp rsi, rdi
+ jne .LBB0_216 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 64 - guard not visible here
+# %bb.217:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_218: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_220 # nothing to peel
+.LBB0_219: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi] # peel loop: one element per iteration
+ add di, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_219
+.LBB0_220:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_218; the peel loop handled them all
+.LBB0_221: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi] # scalar loop unrolled 4x
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_221 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_381: # 64-bit integer subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 8*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 8*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_386 # r8 overlaps the rdx range: skip the vector loop
+# %bb.382:
+ and al, dil
+ jne .LBB0_386 # r8 overlaps the rcx range: skip the vector loop
+# %bb.383:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_384: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi] # load 16 qwords (4 x 32 bytes) from rdx
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi] # subtract the matching 16 qwords from rcx
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0 # store the 16 results to r8
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_384 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 16 - guard not visible here
+# %bb.385:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_386: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_388 # nothing to peel
+.LBB0_387: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi] # peel loop: one element per iteration
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_387
+.LBB0_388:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_386; the peel loop handled them all
+.LBB0_389: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi] # scalar loop unrolled 4x
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_389 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_393: # single-precision float subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_398 # r8 overlaps the rdx range: skip the vector loop
+# %bb.394:
+ and al, dil
+ jne .LBB0_398 # r8 overlaps the rcx range: skip the vector loop
+# %bb.395:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_396: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 4*rdi] # load 32 floats (4 x 32 bytes) from rdx
+ vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vsubps ymm0, ymm0, ymmword ptr [rcx + 4*rdi] # subtract the matching 32 floats from rcx
+ vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_396 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.397:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_398: # scalar tail, starting at element index rsi
+ mov rdi, rsi
+ not rdi
+ add rdi, r10 # rdi = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_400 # nothing to peel
+.LBB0_399: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi] # peel loop: one element per iteration
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_399
+.LBB0_400:
+ cmp rdi, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_398; the peel loop handled them all
+.LBB0_401: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi] # scalar loop unrolled 4x
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_401 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_507: # 64-bit integer subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 8*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 8*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_512 # r8 overlaps the rdx range: skip the vector loop
+# %bb.508:
+ and al, dil
+ jne .LBB0_512 # r8 overlaps the rcx range: skip the vector loop
+# %bb.509:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_510: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 8*rdi] # load 16 qwords (4 x 32 bytes) from rdx
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi] # subtract the matching 16 qwords from rcx
+ vpsubq ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm2, ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm3, ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0 # store the 16 results to r8
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_510 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 16 - guard not visible here
+# %bb.511:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_512: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_514 # nothing to peel
+.LBB0_513: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rdx + 8*rsi] # peel loop: one element per iteration
+ sub rdi, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_513
+.LBB0_514:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_512; the peel loop handled them all
+.LBB0_515: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi] # scalar loop unrolled 4x
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_515 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_519: # single-precision float subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_524 # r8 overlaps the rdx range: skip the vector loop
+# %bb.520:
+ and al, dil
+ jne .LBB0_524 # r8 overlaps the rcx range: skip the vector loop
+# %bb.521:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_522: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rdx + 4*rdi] # load 32 floats (4 x 32 bytes) from rdx
+ vmovups ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vsubps ymm0, ymm0, ymmword ptr [rcx + 4*rdi] # subtract the matching 32 floats from rcx
+ vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_522 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.523:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_524: # scalar tail, starting at element index rsi
+ mov rdi, rsi
+ not rdi
+ add rdi, r10 # rdi = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_526 # nothing to peel
+.LBB0_525: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi] # peel loop: one element per iteration
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_525
+.LBB0_526:
+ cmp rdi, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_524; the peel loop handled them all
+.LBB0_527: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi] # scalar loop unrolled 4x
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vsubss xmm0, xmm0, dword ptr [rcx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_527 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_122: # 64-bit integer add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 8*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 8*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_127 # r8 overlaps the rdx range: skip the vector loop
+# %bb.123:
+ and al, dil
+ jne .LBB0_127 # r8 overlaps the rcx range: skip the vector loop
+# %bb.124:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_125: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi] # load 16 qwords (4 x 32 bytes) from rcx
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi] # add the matching 16 qwords from rdx
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0 # store the 16 results to r8
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_125 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 16 - guard not visible here
+# %bb.126:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_127: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_129 # nothing to peel
+.LBB0_128: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi] # peel loop: one element per iteration
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_128
+.LBB0_129:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_127; the peel loop handled them all
+.LBB0_130: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi] # scalar loop unrolled 4x
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_130 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_134: # single-precision float add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_139 # r8 overlaps the rdx range: skip the vector loop
+# %bb.135:
+ and al, dil
+ jne .LBB0_139 # r8 overlaps the rcx range: skip the vector loop
+# %bb.136:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_137: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rcx + 4*rdi] # load 32 floats (4 x 32 bytes) from rcx
+ vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vaddps ymm0, ymm0, ymmword ptr [rdx + 4*rdi] # add the matching 32 floats from rdx
+ vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_137 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.138:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_139: # scalar tail, starting at element index rsi
+ mov rdi, rsi
+ not rdi
+ add rdi, r10 # rdi = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_141 # nothing to peel
+.LBB0_140: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi] # peel loop: one element per iteration
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_140
+.LBB0_141:
+ cmp rdi, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_139; the peel loop handled them all
+.LBB0_142: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi] # scalar loop unrolled 4x
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_142 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_255: # 64-bit integer add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 8*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 8*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_260 # r8 overlaps the rdx range: skip the vector loop
+# %bb.256:
+ and al, dil
+ jne .LBB0_260 # r8 overlaps the rcx range: skip the vector loop
+# %bb.257:
+ mov esi, r10d
+ and esi, -16 # rsi = low 32 bits of r10 rounded down to a multiple of 16 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_258: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 8*rdi] # load 16 qwords (4 x 32 bytes) from rcx
+ vmovdqu ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 8*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 8*rdi + 96]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi] # add the matching 16 qwords from rdx
+ vpaddq ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm2, ymm2, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm3, ymm3, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm0 # store the 16 results to r8
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm3
+ add rdi, 16
+ cmp rsi, rdi
+ jne .LBB0_258 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 16 - guard not visible here
+# %bb.259:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_260: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_262 # nothing to peel
+.LBB0_261: # =>This Inner Loop Header: Depth=1
+ mov rdi, qword ptr [rcx + 8*rsi] # peel loop: one element per iteration
+ add rdi, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rdi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_261
+.LBB0_262:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_260; the peel loop handled them all
+.LBB0_263: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi] # scalar loop unrolled 4x
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_263 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_267: # single-precision float add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_272 # r8 overlaps the rdx range: skip the vector loop
+# %bb.268:
+ and al, dil
+ jne .LBB0_272 # r8 overlaps the rcx range: skip the vector loop
+# %bb.269:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_270: # =>This Inner Loop Header: Depth=1
+ vmovups ymm0, ymmword ptr [rcx + 4*rdi] # load 32 floats (4 x 32 bytes) from rcx
+ vmovups ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovups ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovups ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vaddps ymm0, ymm0, ymmword ptr [rdx + 4*rdi] # add the matching 32 floats from rdx
+ vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_270 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.271:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_272: # scalar tail, starting at element index rsi
+ mov rdi, rsi
+ not rdi
+ add rdi, r10 # rdi = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_274 # nothing to peel
+.LBB0_273: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi] # peel loop: one element per iteration
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_273
+.LBB0_274:
+ cmp rdi, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_272; the peel loop handled them all
+.LBB0_275: # =>This Inner Loop Header: Depth=1
+ vmovss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi] # scalar loop unrolled 4x
+ vmovss dword ptr [r8 + 4*rsi], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 4]
+ vmovss dword ptr [r8 + 4*rsi + 4], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 8]
+ vmovss dword ptr [r8 + 4*rsi + 8], xmm0
+ vmovss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ vaddss xmm0, xmm0, dword ptr [rdx + 4*rsi + 12]
+ vmovss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_275 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_306: # 8-bit integer subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_311 # r8 overlaps the rdx range: skip the vector loop
+# %bb.307:
+ and al, dil
+ jne .LBB0_311 # r8 overlaps the rcx range: skip the vector loop
+# %bb.308:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_309: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + rdi] # load 128 bytes (4 x 32 bytes) from rdx
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi] # subtract the matching 128 bytes from rcx
+ vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0 # store the 128 results to r8
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128, written as a subtraction of -128
+ cmp rsi, rdi
+ jne .LBB0_309 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 128 - guard not visible here
+# %bb.310:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_311: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_313 # nothing to peel
+.LBB0_312: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi] # peel loop: one element per iteration
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_312
+.LBB0_313:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_311; the peel loop handled them all
+.LBB0_314: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi] # scalar loop unrolled 4x
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_314 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_432: # 8-bit integer subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_437 # r8 overlaps the rdx range: skip the vector loop
+# %bb.433:
+ and al, dil
+ jne .LBB0_437 # r8 overlaps the rcx range: skip the vector loop
+# %bb.434:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_435: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + rdi] # load 128 bytes (4 x 32 bytes) from rdx
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi] # subtract the matching 128 bytes from rcx
+ vpsubb ymm1, ymm1, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm2, ymm2, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm3, ymm3, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0 # store the 128 results to r8
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128, written as a subtraction of -128
+ cmp rsi, rdi
+ jne .LBB0_435 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 128 - guard not visible here
+# %bb.436:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_437: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_439 # nothing to peel
+.LBB0_438: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi] # peel loop: one element per iteration
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_438
+.LBB0_439:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_437; the peel loop handled them all
+.LBB0_440: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi] # scalar loop unrolled 4x
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_440 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_47: # 8-bit integer add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_52 # r8 overlaps the rdx range: skip the vector loop
+# %bb.48:
+ and al, dil
+ jne .LBB0_52 # r8 overlaps the rcx range: skip the vector loop
+# %bb.49:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_50: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi] # load 128 bytes (4 x 32 bytes) from rcx
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi] # add the matching 128 bytes from rdx
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0 # store the 128 results to r8
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128, written as a subtraction of -128
+ cmp rsi, rdi
+ jne .LBB0_50 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 128 - guard not visible here
+# %bb.51:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_52: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_54 # nothing to peel
+.LBB0_53: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi] # peel loop: one element per iteration
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_53
+.LBB0_54:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_52; the peel loop handled them all
+.LBB0_55: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi] # scalar loop unrolled 4x
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_55 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_180: # 8-bit integer add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_185 # r8 overlaps the rdx range: skip the vector loop
+# %bb.181:
+ and al, dil
+ jne .LBB0_185 # r8 overlaps the rcx range: skip the vector loop
+# %bb.182:
+ mov esi, r10d
+ and esi, -128 # rsi = low 32 bits of r10 rounded down to a multiple of 128 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_183: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + rdi] # load 128 bytes (4 x 32 bytes) from rcx
+ vmovdqu ymm1, ymmword ptr [rcx + rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + rdi + 96]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi] # add the matching 128 bytes from rdx
+ vpaddb ymm1, ymm1, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm2, ymm2, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm3, ymm3, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm0 # store the 128 results to r8
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm3
+ sub rdi, -128 # rdi += 128, written as a subtraction of -128
+ cmp rsi, rdi
+ jne .LBB0_183 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 128 - guard not visible here
+# %bb.184:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_185: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_187 # nothing to peel
+.LBB0_186: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi] # peel loop: one element per iteration
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_186
+.LBB0_187:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_185; the peel loop handled them all
+.LBB0_188: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi] # scalar loop unrolled 4x
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_188 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_360: # 32-bit integer subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_365 # r8 overlaps the rdx range: skip the vector loop
+# %bb.361:
+ and al, dil
+ jne .LBB0_365 # r8 overlaps the rcx range: skip the vector loop
+# %bb.362:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_363: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi] # load 32 dwords (4 x 32 bytes) from rdx
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi] # subtract the matching 32 dwords from rcx
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_363 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.364:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_365: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_367 # nothing to peel
+.LBB0_366: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi] # peel loop: one element per iteration
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_366
+.LBB0_367:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_365; the peel loop handled them all
+.LBB0_368: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi] # scalar loop unrolled 4x
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_368 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_486: # 32-bit integer subtract path: r8[i] = rdx[i] - rcx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_491 # r8 overlaps the rdx range: skip the vector loop
+# %bb.487:
+ and al, dil
+ jne .LBB0_491 # r8 overlaps the rcx range: skip the vector loop
+# %bb.488:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_489: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rdx + 4*rdi] # load 32 dwords (4 x 32 bytes) from rdx
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi] # subtract the matching 32 dwords from rcx
+ vpsubd ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm2, ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm3, ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_489 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.490:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_491: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_493 # nothing to peel
+.LBB0_492: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rdx + 4*rsi] # peel loop: one element per iteration
+ sub edi, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_492
+.LBB0_493:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_491; the peel loop handled them all
+.LBB0_494: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi] # scalar loop unrolled 4x
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_494 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_101: # 32-bit integer add path: r8[i] = rcx[i] + rdx[i] for i in [0, r10); vectorized only when r8 overlaps neither input
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (output) range
+ lea rax, [rdx + 4*r10] # rax = end of the rdx (input) range
+ cmp rax, r8
+ seta r9b # r9b = 1 if the rdx range ends above the start of r8
+ lea rax, [rcx + 4*r10] # rax = end of the rcx (input) range
+ cmp rsi, rdx
+ seta r11b # r11b = 1 if the r8 range ends above the start of rdx
+ cmp rax, r8
+ seta al # al = 1 if the rcx range ends above the start of r8
+ cmp rsi, rcx
+ seta dil # dil = 1 if the r8 range ends above the start of rcx
+ xor esi, esi # rsi = 0: index of the next element to process
+ test r9b, r11b
+ jne .LBB0_106 # r8 overlaps the rdx range: skip the vector loop
+# %bb.102:
+ and al, dil
+ jne .LBB0_106 # r8 overlaps the rcx range: skip the vector loop
+# %bb.103:
+ mov esi, r10d
+ and esi, -32 # rsi = low 32 bits of r10 rounded down to a multiple of 32 (vector loop bound)
+ xor edi, edi # rdi = vector loop index, in elements
+.LBB0_104: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi] # load 32 dwords (4 x 32 bytes) from rcx
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi] # add the matching 32 dwords from rdx
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0 # store the 32 results to r8
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_104 # repeat until rdi == rsi; NOTE(review): body runs before this test, so entry presumably requires r10 >= 32 - guard not visible here
+# %bb.105:
+ cmp rsi, r10
+ je .LBB0_537 # every element was handled by the vector loop
+.LBB0_106: # scalar tail, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rax, r10
+ and rax, 3 # rax = r10 mod 4: number of elements to peel one at a time
+ je .LBB0_108 # nothing to peel
+.LBB0_107: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi] # peel loop: one element per iteration
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_107
+.LBB0_108:
+ cmp r9, 3
+ jb .LBB0_537 # fewer than 4 elements remained at .LBB0_106; the peel loop handled them all
+.LBB0_109: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi] # scalar loop unrolled 4x
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_109 # repeat until rsi == r10
+ jmp .LBB0_537
+.LBB0_234:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_239
+# %bb.235:
+ and al, dil
+ jne .LBB0_239
+# %bb.236:
+ mov esi, r10d
+ and esi, -32
+ xor edi, edi
+.LBB0_237: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
+ vmovdqu ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vmovdqu ymm2, ymmword ptr [rcx + 4*rdi + 64]
+ vmovdqu ymm3, ymmword ptr [rcx + 4*rdi + 96]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm2, ymm2, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm3, ymm3, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm3
+ add rdi, 32
+ cmp rsi, rdi
+ jne .LBB0_237
+# %bb.238:
+ cmp rsi, r10
+ je .LBB0_537
+.LBB0_239:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rax, r10
+ and rax, 3
+ je .LBB0_241
+.LBB0_240: # =>This Inner Loop Header: Depth=1
+ mov edi, dword ptr [rcx + 4*rsi]
+ add edi, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], edi
+ add rsi, 1
+ add rax, -1
+ jne .LBB0_240
+.LBB0_241:
+ cmp r9, 3
+ jb .LBB0_537
+.LBB0_242: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_242
+.LBB0_537:
+ mov rsp, rbp
+ pop rbp
+ vzeroupper
+ ret
+.Lfunc_end0:
+ .size arithmetic_avx2, .Lfunc_end0-arithmetic_avx2
+ # -- End function
+ .globl arithmetic_arr_scalar_avx2 # -- Begin function arithmetic_arr_scalar_avx2
+ .p2align 4, 0x90
+ .type arithmetic_arr_scalar_avx2,@function
+arithmetic_arr_scalar_avx2: # @arithmetic_arr_scalar_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 1
+ jg .LBB1_11
+# %bb.1:
+ test sil, sil
+ je .LBB1_21
+# %bb.2:
+ cmp sil, 1
+ jne .LBB1_737
+# %bb.3:
+ cmp edi, 6
+ jg .LBB1_37
+# %bb.4:
+ cmp edi, 3
+ jle .LBB1_65
+# %bb.5:
+ cmp edi, 4
+ je .LBB1_105
+# %bb.6:
+ cmp edi, 5
+ je .LBB1_108
+# %bb.7:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.8:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.9:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_10
+# %bb.177:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_297
+# %bb.178:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_297
+.LBB1_10:
+ xor esi, esi
+.LBB1_421:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_423
+.LBB1_422: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_422
+.LBB1_423:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_424: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_424
+ jmp .LBB1_737
+.LBB1_11:
+ cmp sil, 2
+ je .LBB1_29
+# %bb.12:
+ cmp sil, 3
+ jne .LBB1_737
+# %bb.13:
+ cmp edi, 6
+ jg .LBB1_44
+# %bb.14:
+ cmp edi, 3
+ jle .LBB1_70
+# %bb.15:
+ cmp edi, 4
+ je .LBB1_111
+# %bb.16:
+ cmp edi, 5
+ je .LBB1_114
+# %bb.17:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.18:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.19:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_20
+# %bb.180:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_300
+# %bb.181:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_300
+.LBB1_20:
+ xor esi, esi
+.LBB1_429:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_431
+.LBB1_430: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_430
+.LBB1_431:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_432: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_432
+ jmp .LBB1_737
+.LBB1_21:
+ cmp edi, 6
+ jg .LBB1_51
+# %bb.22:
+ cmp edi, 3
+ jle .LBB1_75
+# %bb.23:
+ cmp edi, 4
+ je .LBB1_117
+# %bb.24:
+ cmp edi, 5
+ je .LBB1_120
+# %bb.25:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.26:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.27:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_28
+# %bb.183:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_303
+# %bb.184:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_303
+.LBB1_28:
+ xor esi, esi
+.LBB1_437:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_439
+.LBB1_438: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_438
+.LBB1_439:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_440: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_440
+ jmp .LBB1_737
+.LBB1_29:
+ cmp edi, 6
+ jg .LBB1_58
+# %bb.30:
+ cmp edi, 3
+ jle .LBB1_80
+# %bb.31:
+ cmp edi, 4
+ je .LBB1_123
+# %bb.32:
+ cmp edi, 5
+ je .LBB1_126
+# %bb.33:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.34:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.35:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_36
+# %bb.186:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_306
+# %bb.187:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_306
+.LBB1_36:
+ xor esi, esi
+.LBB1_445:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_447
+.LBB1_446: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_446
+.LBB1_447:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_448: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_448
+ jmp .LBB1_737
+.LBB1_37:
+ cmp edi, 8
+ jle .LBB1_85
+# %bb.38:
+ cmp edi, 9
+ je .LBB1_129
+# %bb.39:
+ cmp edi, 11
+ je .LBB1_132
+# %bb.40:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.41:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.42:
+ vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB1_43
+# %bb.189:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_309
+# %bb.190:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_309
+.LBB1_43:
+ xor ecx, ecx
+.LBB1_453:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_455
+.LBB1_454: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_454
+.LBB1_455:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_456: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx + 8], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx + 16], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_456
+ jmp .LBB1_737
+.LBB1_44:
+ cmp edi, 8
+ jle .LBB1_90
+# %bb.45:
+ cmp edi, 9
+ je .LBB1_135
+# %bb.46:
+ cmp edi, 11
+ je .LBB1_138
+# %bb.47:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.48:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.49:
+ vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB1_50
+# %bb.192:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_312
+# %bb.193:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_312
+.LBB1_50:
+ xor ecx, ecx
+.LBB1_461:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_463
+.LBB1_462: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_462
+.LBB1_463:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_464: # =>This Inner Loop Header: Depth=1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx + 8], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx + 16], xmm1
+ vmovsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
+ vsubsd xmm1, xmm1, xmm0
+ vmovsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_464
+ jmp .LBB1_737
+.LBB1_51:
+ cmp edi, 8
+ jle .LBB1_95
+# %bb.52:
+ cmp edi, 9
+ je .LBB1_141
+# %bb.53:
+ cmp edi, 11
+ je .LBB1_144
+# %bb.54:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.55:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.56:
+ vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB1_57
+# %bb.195:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_315
+# %bb.196:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_315
+.LBB1_57:
+ xor ecx, ecx
+.LBB1_469:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_471
+.LBB1_470: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_470
+.LBB1_471:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_472: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 8]
+ vmovsd qword ptr [r8 + 8*rcx + 8], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 16]
+ vmovsd qword ptr [r8 + 8*rcx + 16], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 24]
+ vmovsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_472
+ jmp .LBB1_737
+.LBB1_58:
+ cmp edi, 8
+ jle .LBB1_100
+# %bb.59:
+ cmp edi, 9
+ je .LBB1_147
+# %bb.60:
+ cmp edi, 11
+ je .LBB1_150
+# %bb.61:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.62:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.63:
+ vmovsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB1_64
+# %bb.198:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_318
+# %bb.199:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_318
+.LBB1_64:
+ xor ecx, ecx
+.LBB1_477:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_479
+.LBB1_478: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_478
+.LBB1_479:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_480: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx]
+ vmovsd qword ptr [r8 + 8*rcx], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 8]
+ vmovsd qword ptr [r8 + 8*rcx + 8], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 16]
+ vmovsd qword ptr [r8 + 8*rcx + 16], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rdx + 8*rcx + 24]
+ vmovsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_480
+ jmp .LBB1_737
+.LBB1_65:
+ cmp edi, 2
+ je .LBB1_153
+# %bb.66:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.67:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.68:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_69
+# %bb.201:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_321
+# %bb.202:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_321
+.LBB1_69:
+ xor esi, esi
+.LBB1_485:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_487
+.LBB1_486: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_486
+.LBB1_487:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_488: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_488
+ jmp .LBB1_737
+.LBB1_70:
+ cmp edi, 2
+ je .LBB1_156
+# %bb.71:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.72:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.73:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_74
+# %bb.204:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_324
+# %bb.205:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_324
+.LBB1_74:
+ xor esi, esi
+.LBB1_493:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_495
+.LBB1_494: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_494
+.LBB1_495:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_496: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_496
+ jmp .LBB1_737
+.LBB1_75:
+ cmp edi, 2
+ je .LBB1_159
+# %bb.76:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.77:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.78:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_79
+# %bb.207:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_327
+# %bb.208:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_327
+.LBB1_79:
+ xor esi, esi
+.LBB1_501:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_503
+.LBB1_502: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_502
+.LBB1_503:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_504: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_504
+ jmp .LBB1_737
+.LBB1_80:
+ cmp edi, 2
+ je .LBB1_162
+# %bb.81:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.82:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.83:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_84
+# %bb.210:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_330
+# %bb.211:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_330
+.LBB1_84:
+ xor esi, esi
+.LBB1_509:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_511
+.LBB1_510: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_510
+.LBB1_511:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_512: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_512
+ jmp .LBB1_737
+.LBB1_85:
+ cmp edi, 7
+ je .LBB1_165
+# %bb.86:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.87:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.88:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_89
+# %bb.213:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_333
+# %bb.214:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_333
+.LBB1_89:
+ xor esi, esi
+.LBB1_517:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_519
+.LBB1_518: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_518
+.LBB1_519:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_520: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_520
+ jmp .LBB1_737
+.LBB1_90:
+ cmp edi, 7
+ je .LBB1_168
+# %bb.91:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.92:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.93:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_94
+# %bb.216:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_336
+# %bb.217:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_336
+.LBB1_94:
+ xor esi, esi
+.LBB1_525:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_527
+.LBB1_526: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_526
+.LBB1_527:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_528: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_528
+ jmp .LBB1_737
+.LBB1_95:
+ cmp edi, 7
+ je .LBB1_171
+# %bb.96:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.97:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.98:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_99
+# %bb.219:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_339
+# %bb.220:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_339
+.LBB1_99:
+ xor esi, esi
+.LBB1_533:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_535
+.LBB1_534: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_534
+.LBB1_535:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_536: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_536
+ jmp .LBB1_737
+.LBB1_100:
+ cmp edi, 7
+ je .LBB1_174
+# %bb.101:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.102:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.103:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_104
+# %bb.222:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_342
+# %bb.223:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_342
+.LBB1_104:
+ xor esi, esi
+.LBB1_541:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_543
+.LBB1_542: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_542
+.LBB1_543:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_544: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_544
+ jmp .LBB1_737
+.LBB1_105:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.106:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_107
+# %bb.225:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_345
+# %bb.226:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_345
+.LBB1_107:
+ xor esi, esi
+.LBB1_549:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_551
+.LBB1_550: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub edi, eax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_550
+.LBB1_551:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_552: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_552
+ jmp .LBB1_737
+.LBB1_108:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.109:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_110
+# %bb.228:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_348
+# %bb.229:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_348
+.LBB1_110:
+ xor esi, esi
+.LBB1_557:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_559
+.LBB1_558: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub edi, eax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_558
+.LBB1_559:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_560: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_560
+ jmp .LBB1_737
+.LBB1_111:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.112:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_113
+# %bb.231:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_351
+# %bb.232:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_351
+.LBB1_113:
+ xor esi, esi
+.LBB1_565:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_567
+.LBB1_566: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub edi, eax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_566
+.LBB1_567:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_568: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_568
+ jmp .LBB1_737
+.LBB1_114:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.115:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_116
+# %bb.234:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_354
+# %bb.235:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_354
+.LBB1_116:
+ xor esi, esi
+.LBB1_573:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_575
+.LBB1_574: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ sub edi, eax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_574
+.LBB1_575:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_576: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_576
+ jmp .LBB1_737
+.LBB1_117:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.118:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_119
+# %bb.237:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_357
+# %bb.238:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_357
+.LBB1_119:
+ xor esi, esi
+.LBB1_581:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_583
+.LBB1_582: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_582
+.LBB1_583:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_584: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_584
+ jmp .LBB1_737
+.LBB1_120:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.121:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_122
+# %bb.240:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_360
+# %bb.241:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_360
+.LBB1_122:
+ xor esi, esi
+.LBB1_589:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_591
+.LBB1_590: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_590
+.LBB1_591:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_592: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_592
+ jmp .LBB1_737
+.LBB1_123:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.124:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_125
+# %bb.243:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_363
+# %bb.244:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_363
+.LBB1_125:
+ xor esi, esi
+.LBB1_597:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_599
+.LBB1_598: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_598
+.LBB1_599:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_600: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_600
+ jmp .LBB1_737
+.LBB1_126:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.127:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_128
+# %bb.246:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_366
+# %bb.247:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_366
+.LBB1_128:
+ xor esi, esi
+.LBB1_605:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rcx, r10
+ and rcx, 3
+ je .LBB1_607
+.LBB1_606: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rdx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rcx, -1
+ jne .LBB1_606
+.LBB1_607:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_608: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_608
+ jmp .LBB1_737
+.LBB1_129:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.130:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_131
+# %bb.249:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_369
+# %bb.250:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_369
+.LBB1_131:
+ xor esi, esi
+.LBB1_613:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_615
+.LBB1_614: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_614
+.LBB1_615:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_616: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_616
+ jmp .LBB1_737
+.LBB1_132:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.133:
+ vmovss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB1_134
+# %bb.252:
+ lea rcx, [rdx + 4*rax]
+ cmp rcx, r8
+ jbe .LBB1_372
+# %bb.253:
+ lea rcx, [r8 + 4*rax]
+ cmp rcx, rdx
+ jbe .LBB1_372
+.LBB1_134:
+ xor ecx, ecx
+.LBB1_621:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_623
+.LBB1_622: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_622
+.LBB1_623:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_624: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx + 4], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx + 8], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_624
+ jmp .LBB1_737
+.LBB1_135:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.136:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_137
+# %bb.255:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_375
+# %bb.256:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_375
+.LBB1_137:
+ xor esi, esi
+.LBB1_629:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_631
+.LBB1_630: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_630
+.LBB1_631:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_632: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_632
+ jmp .LBB1_737
+.LBB1_138:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.139:
+ vmovss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB1_140
+# %bb.258:
+ lea rcx, [rdx + 4*rax]
+ cmp rcx, r8
+ jbe .LBB1_378
+# %bb.259:
+ lea rcx, [r8 + 4*rax]
+ cmp rcx, rdx
+ jbe .LBB1_378
+.LBB1_140:
+ xor ecx, ecx
+.LBB1_637:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_639
+.LBB1_638: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_638
+.LBB1_639:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_640: # =>This Inner Loop Header: Depth=1
+ vmovss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx + 4], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx + 8], xmm1
+ vmovss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
+ vsubss xmm1, xmm1, xmm0
+ vmovss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_640
+ jmp .LBB1_737
+.LBB1_141:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.142:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_143
+# %bb.261:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_381
+# %bb.262:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_381
+.LBB1_143:
+ xor esi, esi
+.LBB1_645:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_647
+.LBB1_646: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_646
+.LBB1_647:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_648: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_648
+ jmp .LBB1_737
+.LBB1_144:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.145:
+ vmovss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB1_146
+# %bb.264:
+ lea rcx, [rdx + 4*rax]
+ cmp rcx, r8
+ jbe .LBB1_384
+# %bb.265:
+ lea rcx, [r8 + 4*rax]
+ cmp rcx, rdx
+ jbe .LBB1_384
+.LBB1_146:
+ xor ecx, ecx
+.LBB1_653:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_655
+.LBB1_654: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx]
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_654
+.LBB1_655:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_656: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx]
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx + 4]
+ vmovss dword ptr [r8 + 4*rcx + 4], xmm1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx + 8]
+ vmovss dword ptr [r8 + 4*rcx + 8], xmm1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx + 12]
+ vmovss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_656
+ jmp .LBB1_737
+.LBB1_147:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.148:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_149
+# %bb.267:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_387
+# %bb.268:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_387
+.LBB1_149:
+ xor esi, esi
+.LBB1_661:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_663
+.LBB1_662: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_662
+.LBB1_663:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_664: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_664
+ jmp .LBB1_737
+.LBB1_150:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.151:
+ vmovss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB1_152
+# %bb.270:
+ lea rcx, [rdx + 4*rax]
+ cmp rcx, r8
+ jbe .LBB1_390
+# %bb.271:
+ lea rcx, [r8 + 4*rax]
+ cmp rcx, rdx
+ jbe .LBB1_390
+.LBB1_152:
+ xor ecx, ecx
+.LBB1_669:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_671
+.LBB1_670: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx]
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_670
+.LBB1_671:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_672: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx]
+ vmovss dword ptr [r8 + 4*rcx], xmm1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx + 4]
+ vmovss dword ptr [r8 + 4*rcx + 4], xmm1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx + 8]
+ vmovss dword ptr [r8 + 4*rcx + 8], xmm1
+ vaddss xmm1, xmm0, dword ptr [rdx + 4*rcx + 12]
+ vmovss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_672
+ jmp .LBB1_737
+.LBB1_153:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.154:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_155
+# %bb.273:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_393
+# %bb.274:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_393
+.LBB1_155:
+ xor esi, esi
+.LBB1_677:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_679
+.LBB1_678: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_678
+.LBB1_679:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_680: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_680
+ jmp .LBB1_737
+.LBB1_156:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.157:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_158
+# %bb.276:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_396
+# %bb.277:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_396
+.LBB1_158:
+ xor esi, esi
+.LBB1_685:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_687
+.LBB1_686: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_686
+.LBB1_687:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_688: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_688
+ jmp .LBB1_737
+.LBB1_159:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.160:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_161
+# %bb.279:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_399
+# %bb.280:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_399
+.LBB1_161:
+ xor esi, esi
+.LBB1_693:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_695
+.LBB1_694: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_694
+.LBB1_695:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_696: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_696
+ jmp .LBB1_737
+.LBB1_162:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.163:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB1_164
+# %bb.282:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_402
+# %bb.283:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_402
+.LBB1_164:
+ xor esi, esi
+.LBB1_701:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_703
+.LBB1_702: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_702
+.LBB1_703:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_704: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_704
+ jmp .LBB1_737
+.LBB1_165:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.166:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_167
+# %bb.285:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_405
+# %bb.286:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_405
+.LBB1_167:
+ xor esi, esi
+.LBB1_709:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_711
+.LBB1_710: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_710
+.LBB1_711:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_712: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_712
+ jmp .LBB1_737
+.LBB1_168:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.169:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_170
+# %bb.288:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_408
+# %bb.289:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_408
+.LBB1_170:
+ xor esi, esi
+.LBB1_717:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_719
+.LBB1_718: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_718
+.LBB1_719:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_720: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_720
+ jmp .LBB1_737
+.LBB1_171:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.172:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_173
+# %bb.291:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_411
+# %bb.292:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_411
+.LBB1_173:
+ xor esi, esi
+.LBB1_725:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_727
+.LBB1_726: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_726
+.LBB1_727:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_728: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_728
+ jmp .LBB1_737
+.LBB1_174:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.175:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_176
+# %bb.294:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_414
+# %bb.295:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_414
+.LBB1_176:
+ xor esi, esi
+.LBB1_733:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_735
+.LBB1_734: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_734
+.LBB1_735:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_736: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_736
+ jmp .LBB1_737
+.LBB1_297:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_417
+# %bb.298:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_299: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 224]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_299
+ jmp .LBB1_418
+.LBB1_300:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_425
+# %bb.301:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_302: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 224]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_302
+ jmp .LBB1_426
+.LBB1_303:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_433
+# %bb.304:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_305: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_305
+ jmp .LBB1_434
+.LBB1_306:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_441
+# %bb.307:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_308: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_308
+ jmp .LBB1_442
+.LBB1_309:
+ mov ecx, eax
+ and ecx, -16
+ vbroadcastsd ymm1, xmm0
+ lea rsi, [rcx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_449
+# %bb.310:
+ mov rdi, r9
+ and rdi, -2
+ neg rdi
+ xor esi, esi
+.LBB1_311: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm2, ymmword ptr [rdx + 8*rsi]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rsi + 32]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rsi + 64]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rsi + 96]
+ vsubpd ymm2, ymm2, ymm1
+ vsubpd ymm3, ymm3, ymm1
+ vsubpd ymm4, ymm4, ymm1
+ vsubpd ymm5, ymm5, ymm1
+ vmovupd ymmword ptr [r8 + 8*rsi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rsi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rsi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rsi + 96], ymm5
+ vmovupd ymm2, ymmword ptr [rdx + 8*rsi + 128]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rsi + 160]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rsi + 192]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rsi + 224]
+ vsubpd ymm2, ymm2, ymm1
+ vsubpd ymm3, ymm3, ymm1
+ vsubpd ymm4, ymm4, ymm1
+ vsubpd ymm5, ymm5, ymm1
+ vmovupd ymmword ptr [r8 + 8*rsi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rsi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rsi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rsi + 224], ymm5
+ add rsi, 32
+ add rdi, 2
+ jne .LBB1_311
+ jmp .LBB1_450
+.LBB1_312:
+ mov ecx, eax
+ and ecx, -16
+ vbroadcastsd ymm1, xmm0
+ lea rsi, [rcx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_457
+# %bb.313:
+ mov rdi, r9
+ and rdi, -2
+ neg rdi
+ xor esi, esi
+.LBB1_314: # =>This Inner Loop Header: Depth=1
+ vmovupd ymm2, ymmword ptr [rdx + 8*rsi]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rsi + 32]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rsi + 64]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rsi + 96]
+ vsubpd ymm2, ymm2, ymm1
+ vsubpd ymm3, ymm3, ymm1
+ vsubpd ymm4, ymm4, ymm1
+ vsubpd ymm5, ymm5, ymm1
+ vmovupd ymmword ptr [r8 + 8*rsi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rsi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rsi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rsi + 96], ymm5
+ vmovupd ymm2, ymmword ptr [rdx + 8*rsi + 128]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rsi + 160]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rsi + 192]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rsi + 224]
+ vsubpd ymm2, ymm2, ymm1
+ vsubpd ymm3, ymm3, ymm1
+ vsubpd ymm4, ymm4, ymm1
+ vsubpd ymm5, ymm5, ymm1
+ vmovupd ymmword ptr [r8 + 8*rsi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rsi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rsi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rsi + 224], ymm5
+ add rsi, 32
+ add rdi, 2
+ jne .LBB1_314
+ jmp .LBB1_458
+.LBB1_315:
+ mov ecx, eax
+ and ecx, -16
+ vbroadcastsd ymm1, xmm0
+ lea rsi, [rcx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_465
+# %bb.316:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_317: # =>This Inner Loop Header: Depth=1
+ vaddpd ymm2, ymm1, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm3, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm5, ymm1, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm5
+ vaddpd ymm2, ymm1, ymmword ptr [rdx + 8*rdi + 128]
+ vaddpd ymm3, ymm1, ymmword ptr [rdx + 8*rdi + 160]
+ vaddpd ymm4, ymm1, ymmword ptr [rdx + 8*rdi + 192]
+ vaddpd ymm5, ymm1, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [r8 + 8*rdi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 224], ymm5
+ add rdi, 32
+ add rsi, 2
+ jne .LBB1_317
+ jmp .LBB1_466
+.LBB1_318:
+ mov ecx, eax
+ and ecx, -16
+ vbroadcastsd ymm1, xmm0
+ lea rsi, [rcx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_473
+# %bb.319:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_320: # =>This Inner Loop Header: Depth=1
+ vaddpd ymm2, ymm1, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm3, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm5, ymm1, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm5
+ vaddpd ymm2, ymm1, ymmword ptr [rdx + 8*rdi + 128]
+ vaddpd ymm3, ymm1, ymmword ptr [rdx + 8*rdi + 160]
+ vaddpd ymm4, ymm1, ymmword ptr [rdx + 8*rdi + 192]
+ vaddpd ymm5, ymm1, ymmword ptr [rdx + 8*rdi + 224]
+ vmovupd ymmword ptr [r8 + 8*rdi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 224], ymm5
+ add rdi, 32
+ add rsi, 2
+ jne .LBB1_320
+ jmp .LBB1_474
+.LBB1_321:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_481
+# %bb.322:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_323: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 224]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_323
+ jmp .LBB1_482
+.LBB1_324:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_489
+# %bb.325:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_326: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 224]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_326
+ jmp .LBB1_490
+.LBB1_327:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_497
+# %bb.328:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_329: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_329
+ jmp .LBB1_498
+.LBB1_330:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_505
+# %bb.331:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_332: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_332
+ jmp .LBB1_506
+.LBB1_333:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_513
+# %bb.334:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_335: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 224]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_335
+ jmp .LBB1_514
+.LBB1_336:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_521
+# %bb.337:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_338: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 224]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_338
+ jmp .LBB1_522
+.LBB1_339:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_529
+# %bb.340:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_341: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_341
+ jmp .LBB1_530
+.LBB1_342:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_537
+# %bb.343:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_344: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_344
+ jmp .LBB1_538
+.LBB1_345:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_545
+# %bb.346:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_347: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_347
+ jmp .LBB1_546
+.LBB1_348:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_553
+# %bb.349:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_350: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_350
+ jmp .LBB1_554
+.LBB1_351:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_561
+# %bb.352:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_353: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_353
+ jmp .LBB1_562
+.LBB1_354:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_569
+# %bb.355:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_356: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi + 64]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 96]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm2, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_356
+ jmp .LBB1_570
+.LBB1_357:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_577
+# %bb.358:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_359: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_359
+ jmp .LBB1_578
+.LBB1_360:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_585
+# %bb.361:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_362: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_362
+ jmp .LBB1_586
+.LBB1_363:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_593
+# %bb.364:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_365: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_365
+ jmp .LBB1_594
+.LBB1_366:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_601
+# %bb.367:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_368: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi + 64]
+ vpaddw ymm2, ymm0, ymmword ptr [rdx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_368
+ jmp .LBB1_602
+.LBB1_369:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_609
+# %bb.370:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_371: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 224]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_371
+ jmp .LBB1_610
+.LBB1_372:
+ mov ecx, eax
+ and ecx, -32
+ vbroadcastss ymm1, xmm0
+ lea rsi, [rcx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_617
+# %bb.373:
+ mov rdi, r9
+ and rdi, -2
+ neg rdi
+ xor esi, esi
+.LBB1_374: # =>This Inner Loop Header: Depth=1
+ vmovups ymm2, ymmword ptr [rdx + 4*rsi]
+ vmovups ymm3, ymmword ptr [rdx + 4*rsi + 32]
+ vmovups ymm4, ymmword ptr [rdx + 4*rsi + 64]
+ vmovups ymm5, ymmword ptr [rdx + 4*rsi + 96]
+ vsubps ymm2, ymm2, ymm1
+ vsubps ymm3, ymm3, ymm1
+ vsubps ymm4, ymm4, ymm1
+ vsubps ymm5, ymm5, ymm1
+ vmovups ymmword ptr [r8 + 4*rsi], ymm2
+ vmovups ymmword ptr [r8 + 4*rsi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rsi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rsi + 96], ymm5
+ vmovups ymm2, ymmword ptr [rdx + 4*rsi + 128]
+ vmovups ymm3, ymmword ptr [rdx + 4*rsi + 160]
+ vmovups ymm4, ymmword ptr [rdx + 4*rsi + 192]
+ vmovups ymm5, ymmword ptr [rdx + 4*rsi + 224]
+ vsubps ymm2, ymm2, ymm1
+ vsubps ymm3, ymm3, ymm1
+ vsubps ymm4, ymm4, ymm1
+ vsubps ymm5, ymm5, ymm1
+ vmovups ymmword ptr [r8 + 4*rsi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rsi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rsi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rsi + 224], ymm5
+ add rsi, 64
+ add rdi, 2
+ jne .LBB1_374
+ jmp .LBB1_618
+.LBB1_375:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_625
+# %bb.376:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_377: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 224]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_377
+ jmp .LBB1_626
+.LBB1_378:
+ mov ecx, eax
+ and ecx, -32
+ vbroadcastss ymm1, xmm0
+ lea rsi, [rcx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_633
+# %bb.379:
+ mov rdi, r9
+ and rdi, -2
+ neg rdi
+ xor esi, esi
+.LBB1_380: # =>This Inner Loop Header: Depth=1
+ vmovups ymm2, ymmword ptr [rdx + 4*rsi]
+ vmovups ymm3, ymmword ptr [rdx + 4*rsi + 32]
+ vmovups ymm4, ymmword ptr [rdx + 4*rsi + 64]
+ vmovups ymm5, ymmword ptr [rdx + 4*rsi + 96]
+ vsubps ymm2, ymm2, ymm1
+ vsubps ymm3, ymm3, ymm1
+ vsubps ymm4, ymm4, ymm1
+ vsubps ymm5, ymm5, ymm1
+ vmovups ymmword ptr [r8 + 4*rsi], ymm2
+ vmovups ymmword ptr [r8 + 4*rsi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rsi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rsi + 96], ymm5
+ vmovups ymm2, ymmword ptr [rdx + 4*rsi + 128]
+ vmovups ymm3, ymmword ptr [rdx + 4*rsi + 160]
+ vmovups ymm4, ymmword ptr [rdx + 4*rsi + 192]
+ vmovups ymm5, ymmword ptr [rdx + 4*rsi + 224]
+ vsubps ymm2, ymm2, ymm1
+ vsubps ymm3, ymm3, ymm1
+ vsubps ymm4, ymm4, ymm1
+ vsubps ymm5, ymm5, ymm1
+ vmovups ymmword ptr [r8 + 4*rsi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rsi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rsi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rsi + 224], ymm5
+ add rsi, 64
+ add rdi, 2
+ jne .LBB1_380
+ jmp .LBB1_634
+.LBB1_381:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_641
+# %bb.382:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_383: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_383
+ jmp .LBB1_642
+.LBB1_384:
+ mov ecx, eax
+ and ecx, -32
+ vbroadcastss ymm1, xmm0
+ lea rsi, [rcx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_649
+# %bb.385:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_386: # =>This Inner Loop Header: Depth=1
+ vaddps ymm2, ymm1, ymmword ptr [rdx + 4*rdi]
+ vaddps ymm3, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm5, ymm1, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm5
+ vaddps ymm2, ymm1, ymmword ptr [rdx + 4*rdi + 128]
+ vaddps ymm3, ymm1, ymmword ptr [rdx + 4*rdi + 160]
+ vaddps ymm4, ymm1, ymmword ptr [rdx + 4*rdi + 192]
+ vaddps ymm5, ymm1, ymmword ptr [rdx + 4*rdi + 224]
+ vmovups ymmword ptr [r8 + 4*rdi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 224], ymm5
+ add rdi, 64
+ add rsi, 2
+ jne .LBB1_386
+ jmp .LBB1_650
+.LBB1_387:
+ mov esi, r10d
+ and esi, -16
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_657
+# %bb.388:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_389: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi + 128]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rdx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_389
+ jmp .LBB1_658
+.LBB1_390:
+ mov ecx, eax
+ and ecx, -32
+ vbroadcastss ymm1, xmm0
+ lea rsi, [rcx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_665
+# %bb.391:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_392: # =>This Inner Loop Header: Depth=1
+ vaddps ymm2, ymm1, ymmword ptr [rdx + 4*rdi]
+ vaddps ymm3, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm5, ymm1, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm5
+ vaddps ymm2, ymm1, ymmword ptr [rdx + 4*rdi + 128]
+ vaddps ymm3, ymm1, ymmword ptr [rdx + 4*rdi + 160]
+ vaddps ymm4, ymm1, ymmword ptr [rdx + 4*rdi + 192]
+ vaddps ymm5, ymm1, ymmword ptr [rdx + 4*rdi + 224]
+ vmovups ymmword ptr [r8 + 4*rdi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 224], ymm5
+ add rdi, 64
+ add rsi, 2
+ jne .LBB1_392
+ jmp .LBB1_666
+.LBB1_393:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_673
+# %bb.394:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_395: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 224]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_395
+ jmp .LBB1_674
+.LBB1_396:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_681
+# %bb.397:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_398: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 224]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_398
+ jmp .LBB1_682
+.LBB1_399:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_689
+# %bb.400:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_401: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_401
+ jmp .LBB1_690
+.LBB1_402:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rcx, [rsi - 128]
+ mov r9, rcx
+ shr r9, 7
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_697
+# %bb.403:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_404: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi + 128]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rdx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rcx, 2
+ jne .LBB1_404
+ jmp .LBB1_698
+.LBB1_405:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_705
+# %bb.406:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_407: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 224]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_407
+ jmp .LBB1_706
+.LBB1_408:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_713
+# %bb.409:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_410: # =>This Inner Loop Header: Depth=1
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi + 128]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 160]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 192]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 224]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm4, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_410
+ jmp .LBB1_714
+.LBB1_411:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_721
+# %bb.412:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_413: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_413
+ jmp .LBB1_722
+.LBB1_414:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_729
+# %bb.415:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_416: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi + 128]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rdx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_416
+ jmp .LBB1_730
+.LBB1_417:
+ xor edi, edi
+.LBB1_418:
+ test r9b, 1
+ je .LBB1_420
+# %bb.419:
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_420:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_421
+.LBB1_425:
+ xor edi, edi
+.LBB1_426:
+ test r9b, 1
+ je .LBB1_428
+# %bb.427:
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_428:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_429
+.LBB1_433:
+ xor edi, edi
+.LBB1_434:
+ test r9b, 1
+ je .LBB1_436
+# %bb.435:
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_436:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_437
+.LBB1_441:
+ xor edi, edi
+.LBB1_442:
+ test r9b, 1
+ je .LBB1_444
+# %bb.443:
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_444:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_445
+.LBB1_449:
+ xor esi, esi
+.LBB1_450:
+ test r9b, 1
+ je .LBB1_452
+# %bb.451:
+ vmovupd ymm2, ymmword ptr [rdx + 8*rsi]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rsi + 32]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rsi + 64]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rsi + 96]
+ vsubpd ymm2, ymm2, ymm1
+ vsubpd ymm3, ymm3, ymm1
+ vsubpd ymm4, ymm4, ymm1
+ vsubpd ymm1, ymm5, ymm1
+ vmovupd ymmword ptr [r8 + 8*rsi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rsi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rsi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rsi + 96], ymm1
+.LBB1_452:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_453
+.LBB1_457:
+ xor esi, esi
+.LBB1_458:
+ test r9b, 1
+ je .LBB1_460
+# %bb.459:
+ vmovupd ymm2, ymmword ptr [rdx + 8*rsi]
+ vmovupd ymm3, ymmword ptr [rdx + 8*rsi + 32]
+ vmovupd ymm4, ymmword ptr [rdx + 8*rsi + 64]
+ vmovupd ymm5, ymmword ptr [rdx + 8*rsi + 96]
+ vsubpd ymm2, ymm2, ymm1
+ vsubpd ymm3, ymm3, ymm1
+ vsubpd ymm4, ymm4, ymm1
+ vsubpd ymm1, ymm5, ymm1
+ vmovupd ymmword ptr [r8 + 8*rsi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rsi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rsi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rsi + 96], ymm1
+.LBB1_460:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_461
+.LBB1_465:
+ xor edi, edi
+.LBB1_466:
+ test r9b, 1
+ je .LBB1_468
+# %bb.467:
+ vaddpd ymm2, ymm1, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm3, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm1
+.LBB1_468:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_469
+.LBB1_473:
+ xor edi, edi
+.LBB1_474:
+ test r9b, 1
+ je .LBB1_476
+# %bb.475:
+ vaddpd ymm2, ymm1, ymmword ptr [rdx + 8*rdi]
+ vaddpd ymm3, ymm1, ymmword ptr [rdx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rdx + 8*rdi + 64]
+ vaddpd ymm1, ymm1, ymmword ptr [rdx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm1
+.LBB1_476:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_477
+.LBB1_481:
+ xor edi, edi
+.LBB1_482:
+ test r9b, 1
+ je .LBB1_484
+# %bb.483:
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_484:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_485
+.LBB1_489:
+ xor edi, edi
+.LBB1_490:
+ test r9b, 1
+ je .LBB1_492
+# %bb.491:
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_492:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_493
+.LBB1_497:
+ xor edi, edi
+.LBB1_498:
+ test r9b, 1
+ je .LBB1_500
+# %bb.499:
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_500:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_501
+.LBB1_505:
+ xor edi, edi
+.LBB1_506:
+ test r9b, 1
+ je .LBB1_508
+# %bb.507:
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_508:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_509
+.LBB1_513:
+ xor edi, edi
+.LBB1_514:
+ test r9b, 1
+ je .LBB1_516
+# %bb.515:
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_516:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_517
+.LBB1_521:
+ xor edi, edi
+.LBB1_522:
+ test r9b, 1
+ je .LBB1_524
+# %bb.523:
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_524:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_525
+.LBB1_529:
+ xor edi, edi
+.LBB1_530:
+ test r9b, 1
+ je .LBB1_532
+# %bb.531:
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_532:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_533
+.LBB1_537:
+ xor edi, edi
+.LBB1_538:
+ test r9b, 1
+ je .LBB1_540
+# %bb.539:
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_540:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_541
+.LBB1_545:
+ xor edi, edi
+.LBB1_546:
+ test r9b, 1
+ je .LBB1_548
+# %bb.547:
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm0, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_548:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_549
+.LBB1_553:
+ xor edi, edi
+.LBB1_554:
+ test r9b, 1
+ je .LBB1_556
+# %bb.555:
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm0, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_556:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_557
+.LBB1_561:
+ xor edi, edi
+.LBB1_562:
+ test r9b, 1
+ je .LBB1_564
+# %bb.563:
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm0, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_564:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_565
+.LBB1_569:
+ xor edi, edi
+.LBB1_570:
+ test r9b, 1
+ je .LBB1_572
+# %bb.571:
+ vmovdqu ymm1, ymmword ptr [rdx + 2*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 2*rdi + 32]
+ vpsubw ymm1, ymm1, ymm0
+ vpsubw ymm0, ymm2, ymm0
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_572:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_573
+.LBB1_577:
+ xor edi, edi
+.LBB1_578:
+ test r9b, 1
+ je .LBB1_580
+# %bb.579:
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_580:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_581
+.LBB1_585:
+ xor edi, edi
+.LBB1_586:
+ test r9b, 1
+ je .LBB1_588
+# %bb.587:
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_588:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_589
+.LBB1_593:
+ xor edi, edi
+.LBB1_594:
+ test r9b, 1
+ je .LBB1_596
+# %bb.595:
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_596:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_597
+.LBB1_601:
+ xor edi, edi
+.LBB1_602:
+ test r9b, 1
+ je .LBB1_604
+# %bb.603:
+ vpaddw ymm1, ymm0, ymmword ptr [rdx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rdx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB1_604:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_605
+.LBB1_609:
+ xor edi, edi
+.LBB1_610:
+ test r9b, 1
+ je .LBB1_612
+# %bb.611:
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_612:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_613
+.LBB1_617:
+ xor esi, esi
+.LBB1_618:
+ test r9b, 1
+ je .LBB1_620
+# %bb.619:
+ vmovups ymm2, ymmword ptr [rdx + 4*rsi]
+ vmovups ymm3, ymmword ptr [rdx + 4*rsi + 32]
+ vmovups ymm4, ymmword ptr [rdx + 4*rsi + 64]
+ vmovups ymm5, ymmword ptr [rdx + 4*rsi + 96]
+ vsubps ymm2, ymm2, ymm1
+ vsubps ymm3, ymm3, ymm1
+ vsubps ymm4, ymm4, ymm1
+ vsubps ymm1, ymm5, ymm1
+ vmovups ymmword ptr [r8 + 4*rsi], ymm2
+ vmovups ymmword ptr [r8 + 4*rsi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rsi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rsi + 96], ymm1
+.LBB1_620:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_621
+.LBB1_625:
+ xor edi, edi
+.LBB1_626:
+ test r9b, 1
+ je .LBB1_628
+# %bb.627:
+ vmovdqu ymm1, ymmword ptr [rdx + 8*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 8*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 8*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 8*rdi + 96]
+ vpsubq ymm1, ymm1, ymm0
+ vpsubq ymm2, ymm2, ymm0
+ vpsubq ymm3, ymm3, ymm0
+ vpsubq ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_628:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_629
+.LBB1_633:
+ xor esi, esi
+.LBB1_634:
+ test r9b, 1
+ je .LBB1_636
+# %bb.635:
+ vmovups ymm2, ymmword ptr [rdx + 4*rsi]
+ vmovups ymm3, ymmword ptr [rdx + 4*rsi + 32]
+ vmovups ymm4, ymmword ptr [rdx + 4*rsi + 64]
+ vmovups ymm5, ymmword ptr [rdx + 4*rsi + 96]
+ vsubps ymm2, ymm2, ymm1
+ vsubps ymm3, ymm3, ymm1
+ vsubps ymm4, ymm4, ymm1
+ vsubps ymm1, ymm5, ymm1
+ vmovups ymmword ptr [r8 + 4*rsi], ymm2
+ vmovups ymmword ptr [r8 + 4*rsi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rsi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rsi + 96], ymm1
+.LBB1_636:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_637
+.LBB1_641:
+ xor edi, edi
+.LBB1_642:
+ test r9b, 1
+ je .LBB1_644
+# %bb.643:
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_644:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_645
+.LBB1_649:
+ xor edi, edi
+.LBB1_650:
+ test r9b, 1
+ je .LBB1_652
+# %bb.651:
+ vaddps ymm2, ymm1, ymmword ptr [rdx + 4*rdi]
+ vaddps ymm3, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm1
+.LBB1_652:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_653
+.LBB1_657:
+ xor edi, edi
+.LBB1_658:
+ test r9b, 1
+ je .LBB1_660
+# %bb.659:
+ vpaddq ymm1, ymm0, ymmword ptr [rdx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rdx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rdx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rdx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB1_660:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_661
+.LBB1_665:
+ xor edi, edi
+.LBB1_666:
+ test r9b, 1
+ je .LBB1_668
+# %bb.667:
+ vaddps ymm2, ymm1, ymmword ptr [rdx + 4*rdi]
+ vaddps ymm3, ymm1, ymmword ptr [rdx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rdx + 4*rdi + 64]
+ vaddps ymm1, ymm1, ymmword ptr [rdx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm1
+.LBB1_668:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_669
+.LBB1_673:
+ xor edi, edi
+.LBB1_674:
+ test r9b, 1
+ je .LBB1_676
+# %bb.675:
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_676:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_677
+.LBB1_681:
+ xor edi, edi
+.LBB1_682:
+ test r9b, 1
+ je .LBB1_684
+# %bb.683:
+ vmovdqu ymm1, ymmword ptr [rdx + rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + rdi + 96]
+ vpsubb ymm1, ymm1, ymm0
+ vpsubb ymm2, ymm2, ymm0
+ vpsubb ymm3, ymm3, ymm0
+ vpsubb ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_684:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_685
+.LBB1_689:
+ xor edi, edi
+.LBB1_690:
+ test r9b, 1
+ je .LBB1_692
+# %bb.691:
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_692:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_693
+.LBB1_697:
+ xor edi, edi
+.LBB1_698:
+ test r9b, 1
+ je .LBB1_700
+# %bb.699:
+ vpaddb ymm1, ymm0, ymmword ptr [rdx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rdx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rdx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rdx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB1_700:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_701
+.LBB1_705:
+ xor edi, edi
+.LBB1_706:
+ test r9b, 1
+ je .LBB1_708
+# %bb.707:
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_708:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_709
+.LBB1_713:
+ xor edi, edi
+.LBB1_714:
+ test r9b, 1
+ je .LBB1_716
+# %bb.715:
+ vmovdqu ymm1, ymmword ptr [rdx + 4*rdi]
+ vmovdqu ymm2, ymmword ptr [rdx + 4*rdi + 32]
+ vmovdqu ymm3, ymmword ptr [rdx + 4*rdi + 64]
+ vmovdqu ymm4, ymmword ptr [rdx + 4*rdi + 96]
+ vpsubd ymm1, ymm1, ymm0
+ vpsubd ymm2, ymm2, ymm0
+ vpsubd ymm3, ymm3, ymm0
+ vpsubd ymm0, ymm4, ymm0
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_716:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_717
+.LBB1_721:
+ xor edi, edi
+.LBB1_722:
+ test r9b, 1
+ je .LBB1_724
+# %bb.723:
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_724:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_725
+.LBB1_729:
+ xor edi, edi
+.LBB1_730:
+ test r9b, 1
+ je .LBB1_732
+# %bb.731:
+ vpaddd ymm1, ymm0, ymmword ptr [rdx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rdx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rdx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB1_732:
+ cmp rsi, r10
+ jne .LBB1_733
+.LBB1_737:
+ mov rsp, rbp
+ pop rbp
+ vzeroupper
+ ret
+.Lfunc_end1:
+ .size arithmetic_arr_scalar_avx2, .Lfunc_end1-arithmetic_arr_scalar_avx2
+ # -- End function
+ .globl arithmetic_scalar_arr_avx2 # -- Begin function arithmetic_scalar_arr_avx2
+ .p2align 4, 0x90
+ .type arithmetic_scalar_arr_avx2,@function
+arithmetic_scalar_arr_avx2: # @arithmetic_scalar_arr_avx2
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 1
+ jg .LBB2_11
+# %bb.1:
+ test sil, sil
+ je .LBB2_21
+# %bb.2:
+ cmp sil, 1
+ jne .LBB2_737
+# %bb.3:
+ cmp edi, 6
+ jg .LBB2_37
+# %bb.4:
+ cmp edi, 3
+ jle .LBB2_65
+# %bb.5:
+ cmp edi, 4
+ je .LBB2_105
+# %bb.6:
+ cmp edi, 5
+ je .LBB2_108
+# %bb.7:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.8:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.9:
+ mov r11d, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_10
+# %bb.177:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_297
+# %bb.178:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_297
+.LBB2_10:
+ xor esi, esi
+.LBB2_421:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_423
+.LBB2_422: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_422
+.LBB2_423:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_424: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_424
+ jmp .LBB2_737
+.LBB2_11:
+ cmp sil, 2
+ je .LBB2_29
+# %bb.12:
+ cmp sil, 3
+ jne .LBB2_737
+# %bb.13:
+ cmp edi, 6
+ jg .LBB2_44
+# %bb.14:
+ cmp edi, 3
+ jle .LBB2_70
+# %bb.15:
+ cmp edi, 4
+ je .LBB2_111
+# %bb.16:
+ cmp edi, 5
+ je .LBB2_114
+# %bb.17:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.18:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.19:
+ mov r11d, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_20
+# %bb.180:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_300
+# %bb.181:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_300
+.LBB2_20:
+ xor esi, esi
+.LBB2_429:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_431
+.LBB2_430: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_430
+.LBB2_431:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_432: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_432
+ jmp .LBB2_737
+.LBB2_21:
+ cmp edi, 6
+ jg .LBB2_51
+# %bb.22:
+ cmp edi, 3
+ jle .LBB2_75
+# %bb.23:
+ cmp edi, 4
+ je .LBB2_117
+# %bb.24:
+ cmp edi, 5
+ je .LBB2_120
+# %bb.25:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.26:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.27:
+ mov eax, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_28
+# %bb.183:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_303
+# %bb.184:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_303
+.LBB2_28:
+ xor esi, esi
+.LBB2_437:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_439
+.LBB2_438: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_438
+.LBB2_439:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_440: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_440
+ jmp .LBB2_737
+.LBB2_29:
+ cmp edi, 6
+ jg .LBB2_58
+# %bb.30:
+ cmp edi, 3
+ jle .LBB2_80
+# %bb.31:
+ cmp edi, 4
+ je .LBB2_123
+# %bb.32:
+ cmp edi, 5
+ je .LBB2_126
+# %bb.33:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.34:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.35:
+ mov eax, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_36
+# %bb.186:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_306
+# %bb.187:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_306
+.LBB2_36:
+ xor esi, esi
+.LBB2_445:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_447
+.LBB2_446: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_446
+.LBB2_447:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_448: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_448
+ jmp .LBB2_737
+.LBB2_37:
+ cmp edi, 8
+ jle .LBB2_85
+# %bb.38:
+ cmp edi, 9
+ je .LBB2_129
+# %bb.39:
+ cmp edi, 11
+ je .LBB2_132
+# %bb.40:
+ cmp edi, 12
+ jne .LBB2_737
+# %bb.41:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.42:
+ vmovsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB2_43
+# %bb.189:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB2_309
+# %bb.190:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB2_309
+.LBB2_43:
+ xor edx, edx
+.LBB2_453:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_455
+.LBB2_454: # =>This Inner Loop Header: Depth=1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_454
+.LBB2_455:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_456: # =>This Inner Loop Header: Depth=1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 8]
+ vmovsd qword ptr [r8 + 8*rdx + 8], xmm1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 16]
+ vmovsd qword ptr [r8 + 8*rdx + 16], xmm1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 24]
+ vmovsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_456
+ jmp .LBB2_737
+.LBB2_44:
+ cmp edi, 8
+ jle .LBB2_90
+# %bb.45:
+ cmp edi, 9
+ je .LBB2_135
+# %bb.46:
+ cmp edi, 11
+ je .LBB2_138
+# %bb.47:
+ cmp edi, 12
+ jne .LBB2_737
+# %bb.48:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.49:
+ vmovsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB2_50
+# %bb.192:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB2_312
+# %bb.193:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB2_312
+.LBB2_50:
+ xor edx, edx
+.LBB2_461:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_463
+.LBB2_462: # =>This Inner Loop Header: Depth=1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_462
+.LBB2_463:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_464: # =>This Inner Loop Header: Depth=1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 8]
+ vmovsd qword ptr [r8 + 8*rdx + 8], xmm1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 16]
+ vmovsd qword ptr [r8 + 8*rdx + 16], xmm1
+ vsubsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 24]
+ vmovsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_464
+ jmp .LBB2_737
+.LBB2_51:
+ cmp edi, 8
+ jle .LBB2_95
+# %bb.52:
+ cmp edi, 9
+ je .LBB2_141
+# %bb.53:
+ cmp edi, 11
+ je .LBB2_144
+# %bb.54:
+ cmp edi, 12
+ jne .LBB2_737
+# %bb.55:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.56:
+ vmovsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB2_57
+# %bb.195:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB2_315
+# %bb.196:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB2_315
+.LBB2_57:
+ xor edx, edx
+.LBB2_469:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_471
+.LBB2_470: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_470
+.LBB2_471:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_472: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 8]
+ vmovsd qword ptr [r8 + 8*rdx + 8], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 16]
+ vmovsd qword ptr [r8 + 8*rdx + 16], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 24]
+ vmovsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_472
+ jmp .LBB2_737
+.LBB2_58:
+ cmp edi, 8
+ jle .LBB2_100
+# %bb.59:
+ cmp edi, 9
+ je .LBB2_147
+# %bb.60:
+ cmp edi, 11
+ je .LBB2_150
+# %bb.61:
+ cmp edi, 12
+ jne .LBB2_737
+# %bb.62:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.63:
+ vmovsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 16
+ jb .LBB2_64
+# %bb.198:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB2_318
+# %bb.199:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB2_318
+.LBB2_64:
+ xor edx, edx
+.LBB2_477:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_479
+.LBB2_478: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_478
+.LBB2_479:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_480: # =>This Inner Loop Header: Depth=1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx]
+ vmovsd qword ptr [r8 + 8*rdx], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 8]
+ vmovsd qword ptr [r8 + 8*rdx + 8], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 16]
+ vmovsd qword ptr [r8 + 8*rdx + 16], xmm1
+ vaddsd xmm1, xmm0, qword ptr [rcx + 8*rdx + 24]
+ vmovsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_480
+ jmp .LBB2_737
+.LBB2_65:
+ cmp edi, 2
+ je .LBB2_153
+# %bb.66:
+ cmp edi, 3
+ jne .LBB2_737
+# %bb.67:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.68:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB2_69
+# %bb.201:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_321
+# %bb.202:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_321
+.LBB2_69:
+ xor esi, esi
+.LBB2_485:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_487
+.LBB2_486: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_486
+.LBB2_487:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_488: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_488
+ jmp .LBB2_737
+.LBB2_70:
+ cmp edi, 2
+ je .LBB2_156
+# %bb.71:
+ cmp edi, 3
+ jne .LBB2_737
+# %bb.72:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.73:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB2_74
+# %bb.204:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_324
+# %bb.205:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_324
+.LBB2_74:
+ xor esi, esi
+.LBB2_493:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_495
+.LBB2_494: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_494
+.LBB2_495:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_496: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_496
+ jmp .LBB2_737
+.LBB2_75:
+ cmp edi, 2
+ je .LBB2_159
+# %bb.76:
+ cmp edi, 3
+ jne .LBB2_737
+# %bb.77:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.78:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB2_79
+# %bb.207:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_327
+# %bb.208:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_327
+.LBB2_79:
+ xor esi, esi
+.LBB2_501:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_503
+.LBB2_502: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_502
+.LBB2_503:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_504: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_504
+ jmp .LBB2_737
+.LBB2_80: # dispatch on the selector in edi; selector 3 computes out[i] = in[i] + scalar over bytes
+ cmp edi, 2
+ je .LBB2_162 # selector 2 is handled at .LBB2_162
+# %bb.81:
+ cmp edi, 3
+ jne .LBB2_737 # any other selector: jump to .LBB2_737, the exit target used by every path here
+# %bb.82:
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): no work
+# %bb.83:
+ mov al, byte ptr [rdx] # load the one-byte scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 128
+ jb .LBB2_84 # fewer than 128 elements: use the scalar loops below
+# %bb.210:
+ lea rdx, [rcx + r10] # rdx = one past the last input byte
+ cmp rdx, r8
+ jbe .LBB2_330 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.211:
+ lea rdx, [r8 + r10] # rdx = one past the last output byte
+ cmp rdx, rcx
+ jbe .LBB2_330 # output ends at or before the input start: no overlap
+.LBB2_84:
+ xor esi, esi # element index starts at 0
+.LBB2_509:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - index - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_511
+.LBB2_510: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi] # load in[index]
+ add dl, al # add the scalar (8-bit wraparound)
+ mov byte ptr [r8 + rsi], dl # store out[index]
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_510
+.LBB2_511:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_509: skip the unrolled loop
+.LBB2_512: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_512 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_85: # dispatch on the selector in edi; selector 8 computes out[i] = scalar - in[i] over 64-bit integers
+ cmp edi, 7
+ je .LBB2_165 # selector 7 is handled at .LBB2_165
+# %bb.86:
+ cmp edi, 8
+ jne .LBB2_737 # any other selector: jump to .LBB2_737, the exit target used by every path here
+# %bb.87:
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): no work
+# %bb.88:
+ mov r11, qword ptr [rdx] # load the 64-bit scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB2_89 # fewer than 16 elements: use the scalar loops below
+# %bb.213:
+ lea rdx, [rcx + 8*r10] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_333 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.214:
+ lea rdx, [r8 + 8*r10] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_333 # output ends at or before the input start: no overlap
+.LBB2_89:
+ xor esi, esi # element index starts at 0
+.LBB2_517:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - index - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_519
+.LBB2_518: # =>This Inner Loop Header: Depth=1
+ mov rax, r11 # start from the scalar
+ sub rax, qword ptr [rcx + 8*rsi] # subtract in[index]
+ mov qword ptr [r8 + 8*rsi], rax # store out[index]
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_518
+.LBB2_519:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_517: skip the unrolled loop
+.LBB2_520: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_520 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_90:
+ cmp edi, 7
+ je .LBB2_168
+# %bb.91:
+ cmp edi, 8
+ jne .LBB2_737
+# %bb.92:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.93:
+ mov r11, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_94
+# %bb.216:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_336
+# %bb.217:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_336
+.LBB2_94:
+ xor esi, esi
+.LBB2_525:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_527
+.LBB2_526: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_526
+.LBB2_527:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_528: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_528
+ jmp .LBB2_737
+.LBB2_95: # dispatch on the selector in edi; selector 8 computes out[i] = in[i] + scalar over 64-bit integers
+ cmp edi, 7
+ je .LBB2_171 # selector 7 is handled at .LBB2_171
+# %bb.96:
+ cmp edi, 8
+ jne .LBB2_737 # any other selector: jump to .LBB2_737, the exit target used by every path here
+# %bb.97:
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): no work
+# %bb.98:
+ mov rax, qword ptr [rdx] # load the 64-bit scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB2_99 # fewer than 16 elements: use the scalar loops below
+# %bb.219:
+ lea rdx, [rcx + 8*r10] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_339 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.220:
+ lea rdx, [r8 + 8*r10] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_339 # output ends at or before the input start: no overlap
+.LBB2_99:
+ xor esi, esi # element index starts at 0
+.LBB2_533:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - index - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_535
+.LBB2_534: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi] # load in[index]
+ add rdx, rax # add the scalar
+ mov qword ptr [r8 + 8*rsi], rdx # store out[index]
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_534
+.LBB2_535:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_533: skip the unrolled loop
+.LBB2_536: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_536 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_100:
+ cmp edi, 7
+ je .LBB2_174
+# %bb.101:
+ cmp edi, 8
+ jne .LBB2_737
+# %bb.102:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.103:
+ mov rax, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_104
+# %bb.222:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_342
+# %bb.223:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_342
+.LBB2_104:
+ xor esi, esi
+.LBB2_541:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_543
+.LBB2_542: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_542
+.LBB2_543:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_544: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_544
+ jmp .LBB2_737
+.LBB2_105: # out[i] = scalar - in[i] over 16-bit integers
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.106:
+ movzx eax, word ptr [rdx] # load the 16-bit scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_107 # fewer than 32 elements: use the scalar loops below
+# %bb.225:
+ lea rdx, [rcx + 2*r10] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_345 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.226:
+ lea rdx, [r8 + 2*r10] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_345 # output ends at or before the input start: no overlap
+.LBB2_107:
+ xor esi, esi # element index starts at 0
+.LBB2_549:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - index - 1
+ mov rdx, r10
+ and rdx, 3 # rdx = count mod 4 = trip count of the one-element loop
+ je .LBB2_551
+.LBB2_550: # =>This Inner Loop Header: Depth=1
+ mov edi, eax # start from the scalar
+ sub di, word ptr [rcx + 2*rsi] # subtract in[index] (16-bit wraparound)
+ mov word ptr [r8 + 2*rsi], di # store out[index]
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_550
+.LBB2_551:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_549: skip the unrolled loop
+.LBB2_552: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_552 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_108:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.109:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_110
+# %bb.228:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_348
+# %bb.229:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_348
+.LBB2_110:
+ xor esi, esi
+.LBB2_557:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdx, r10
+ and rdx, 3
+ je .LBB2_559
+.LBB2_558: # =>This Inner Loop Header: Depth=1
+ mov edi, eax
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_558
+.LBB2_559:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_560: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_560
+ jmp .LBB2_737
+.LBB2_111:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.112:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_113
+# %bb.231:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_351
+# %bb.232:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_351
+.LBB2_113:
+ xor esi, esi
+.LBB2_565:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdx, r10
+ and rdx, 3
+ je .LBB2_567
+.LBB2_566: # =>This Inner Loop Header: Depth=1
+ mov edi, eax
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_566
+.LBB2_567:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_568: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_568
+ jmp .LBB2_737
+.LBB2_114:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.115:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_116
+# %bb.234:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_354
+# %bb.235:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_354
+.LBB2_116:
+ xor esi, esi
+.LBB2_573:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdx, r10
+ and rdx, 3
+ je .LBB2_575
+.LBB2_574: # =>This Inner Loop Header: Depth=1
+ mov edi, eax
+ sub di, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_574
+.LBB2_575:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_576: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_576
+ jmp .LBB2_737
+.LBB2_117: # out[i] = in[i] + scalar over 16-bit integers
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.118:
+ movzx eax, word ptr [rdx] # load the 16-bit scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_119 # fewer than 32 elements: use the scalar loops below
+# %bb.237:
+ lea rdx, [rcx + 2*r10] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_357 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.238:
+ lea rdx, [r8 + 2*r10] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_357 # output ends at or before the input start: no overlap
+.LBB2_119:
+ xor esi, esi # element index starts at 0
+.LBB2_581:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - index - 1
+ mov rdx, r10
+ and rdx, 3 # rdx = count mod 4 = trip count of the one-element loop
+ je .LBB2_583
+.LBB2_582: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi] # load in[index]
+ add di, ax # add the scalar (16-bit wraparound)
+ mov word ptr [r8 + 2*rsi], di # store out[index]
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_582
+.LBB2_583:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_581: skip the unrolled loop
+.LBB2_584: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_584 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_120:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.121:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_122
+# %bb.240:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_360
+# %bb.241:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_360
+.LBB2_122:
+ xor esi, esi
+.LBB2_589:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdx, r10
+ and rdx, 3
+ je .LBB2_591
+.LBB2_590: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_590
+.LBB2_591:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_592: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_592
+ jmp .LBB2_737
+.LBB2_123:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.124:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_125
+# %bb.243:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_363
+# %bb.244:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_363
+.LBB2_125:
+ xor esi, esi
+.LBB2_597:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdx, r10
+ and rdx, 3
+ je .LBB2_599
+.LBB2_598: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_598
+.LBB2_599:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_600: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_600
+ jmp .LBB2_737
+.LBB2_126:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.127:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_128
+# %bb.246:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_366
+# %bb.247:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_366
+.LBB2_128:
+ xor esi, esi
+.LBB2_605:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdx, r10
+ and rdx, 3
+ je .LBB2_607
+.LBB2_606: # =>This Inner Loop Header: Depth=1
+ movzx edi, word ptr [rcx + 2*rsi]
+ add di, ax
+ mov word ptr [r8 + 2*rsi], di
+ add rsi, 1
+ add rdx, -1
+ jne .LBB2_606
+.LBB2_607:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_608: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_608
+ jmp .LBB2_737
+.LBB2_129:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.130:
+ mov r11, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_131
+# %bb.249:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_369
+# %bb.250:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_369
+.LBB2_131:
+ xor esi, esi
+.LBB2_613:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_615
+.LBB2_614: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_614
+.LBB2_615:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_616: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_616
+ jmp .LBB2_737
+.LBB2_132: # out[i] = scalar - in[i] over single-precision floats
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.133:
+ vmovss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_134 # fewer than 32 elements: use the scalar loops below
+# %bb.252:
+ lea rdx, [rcx + 4*rax] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_372 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.253:
+ lea rdx, [r8 + 4*rax] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_372 # output ends at or before the input start: no overlap
+.LBB2_134:
+ xor edx, edx # element index starts at 0
+.LBB2_621:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - index - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_623
+.LBB2_622: # =>This Inner Loop Header: Depth=1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx] # xmm1 = scalar - in[index]
+ vmovss dword ptr [r8 + 4*rdx], xmm1 # store out[index]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_622
+.LBB2_623:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_621: skip the unrolled loop
+.LBB2_624: # =>This Inner Loop Header: Depth=1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx]
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx + 4]
+ vmovss dword ptr [r8 + 4*rdx + 4], xmm1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx + 8]
+ vmovss dword ptr [r8 + 4*rdx + 8], xmm1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx + 12]
+ vmovss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4 # four elements per iteration
+ cmp rax, rdx
+ jne .LBB2_624 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_135:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.136:
+ mov r11, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_137
+# %bb.255:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_375
+# %bb.256:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_375
+.LBB2_137:
+ xor esi, esi
+.LBB2_629:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_631
+.LBB2_630: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_630
+.LBB2_631:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_632: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_632
+ jmp .LBB2_737
+.LBB2_138:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.139:
+ vmovss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB2_140
+# %bb.258:
+ lea rdx, [rcx + 4*rax]
+ cmp rdx, r8
+ jbe .LBB2_378
+# %bb.259:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB2_378
+.LBB2_140:
+ xor edx, edx
+.LBB2_637:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_639
+.LBB2_638: # =>This Inner Loop Header: Depth=1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx]
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_638
+.LBB2_639:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_640: # =>This Inner Loop Header: Depth=1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx]
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx + 4]
+ vmovss dword ptr [r8 + 4*rdx + 4], xmm1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx + 8]
+ vmovss dword ptr [r8 + 4*rdx + 8], xmm1
+ vsubss xmm1, xmm0, dword ptr [rcx + 4*rdx + 12]
+ vmovss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_640
+ jmp .LBB2_737
+.LBB2_141:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.142:
+ mov rax, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_143
+# %bb.261:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_381
+# %bb.262:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_381
+.LBB2_143:
+ xor esi, esi
+.LBB2_645:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_647
+.LBB2_646: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_646
+.LBB2_647:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_648: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_648
+ jmp .LBB2_737
+.LBB2_144: # out[i] = scalar + in[i] over single-precision floats
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.145:
+ vmovss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_146 # fewer than 32 elements: use the scalar loops below
+# %bb.264:
+ lea rdx, [rcx + 4*rax] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_384 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.265:
+ lea rdx, [r8 + 4*rax] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_384 # output ends at or before the input start: no overlap
+.LBB2_146:
+ xor edx, edx # element index starts at 0
+.LBB2_653:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - index - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_655
+.LBB2_654: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx] # xmm1 = scalar + in[index]
+ vmovss dword ptr [r8 + 4*rdx], xmm1 # store out[index]
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_654
+.LBB2_655:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_653: skip the unrolled loop
+.LBB2_656: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx]
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx + 4]
+ vmovss dword ptr [r8 + 4*rdx + 4], xmm1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx + 8]
+ vmovss dword ptr [r8 + 4*rdx + 8], xmm1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx + 12]
+ vmovss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4 # four elements per iteration
+ cmp rax, rdx
+ jne .LBB2_656 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_147:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.148:
+ mov rax, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_149
+# %bb.267:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_387
+# %bb.268:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_387
+.LBB2_149:
+ xor esi, esi
+.LBB2_661:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_663
+.LBB2_662: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_662
+.LBB2_663:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_664: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_664
+ jmp .LBB2_737
+.LBB2_150:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.151:
+ vmovss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d
+ cmp r9d, 32
+ jb .LBB2_152
+# %bb.270:
+ lea rdx, [rcx + 4*rax]
+ cmp rdx, r8
+ jbe .LBB2_390
+# %bb.271:
+ lea rdx, [r8 + 4*rax]
+ cmp rdx, rcx
+ jbe .LBB2_390
+.LBB2_152:
+ xor edx, edx
+.LBB2_669:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_671
+.LBB2_670: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx]
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_670
+.LBB2_671:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_672: # =>This Inner Loop Header: Depth=1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx]
+ vmovss dword ptr [r8 + 4*rdx], xmm1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx + 4]
+ vmovss dword ptr [r8 + 4*rdx + 4], xmm1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx + 8]
+ vmovss dword ptr [r8 + 4*rdx + 8], xmm1
+ vaddss xmm1, xmm0, dword ptr [rcx + 4*rdx + 12]
+ vmovss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_672
+ jmp .LBB2_737
+.LBB2_153: # out[i] = scalar - in[i] over bytes
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.154:
+ mov al, byte ptr [rdx] # load the one-byte scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 128
+ jb .LBB2_155 # fewer than 128 elements: use the scalar loops below
+# %bb.273:
+ lea rdx, [rcx + r10] # rdx = one past the last input byte
+ cmp rdx, r8
+ jbe .LBB2_393 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.274:
+ lea rdx, [r8 + r10] # rdx = one past the last output byte
+ cmp rdx, rcx
+ jbe .LBB2_393 # output ends at or before the input start: no overlap
+.LBB2_155:
+ xor esi, esi # element index starts at 0
+.LBB2_677:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - index - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_679
+.LBB2_678: # =>This Inner Loop Header: Depth=1
+ mov edx, eax # start from the scalar
+ sub dl, byte ptr [rcx + rsi] # subtract in[index] (8-bit wraparound)
+ mov byte ptr [r8 + rsi], dl # store out[index]
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_678
+.LBB2_679:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_677: skip the unrolled loop
+.LBB2_680: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_680 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_156:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.157:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB2_158
+# %bb.276:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_396
+# %bb.277:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_396
+.LBB2_158:
+ xor esi, esi
+.LBB2_685:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_687
+.LBB2_686: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_686
+.LBB2_687:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_688: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], dl
+ mov edx, eax
+ sub dl, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_688
+ jmp .LBB2_737
+.LBB2_159:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.160:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB2_161
+# %bb.279:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_399
+# %bb.280:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_399
+.LBB2_161:
+ xor esi, esi
+.LBB2_693:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_695
+.LBB2_694: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_694
+.LBB2_695:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_696: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_696
+ jmp .LBB2_737
+.LBB2_162:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.163:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 128
+ jb .LBB2_164
+# %bb.282:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_402
+# %bb.283:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_402
+.LBB2_164:
+ xor esi, esi
+.LBB2_701:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_703
+.LBB2_702: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_702
+.LBB2_703:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_704: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_704
+ jmp .LBB2_737
+.LBB2_165: # out[i] = scalar - in[i] over 32-bit integers
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.166:
+ mov r11d, dword ptr [rdx] # load the 32-bit scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_167 # fewer than 32 elements: use the scalar loops below
+# %bb.285:
+ lea rdx, [rcx + 4*r10] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_405 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.286:
+ lea rdx, [r8 + 4*r10] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_405 # output ends at or before the input start: no overlap
+.LBB2_167:
+ xor esi, esi # element index starts at 0
+.LBB2_709:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - index - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_711
+.LBB2_710: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # start from the scalar
+ sub eax, dword ptr [rcx + 4*rsi] # subtract in[index]
+ mov dword ptr [r8 + 4*rsi], eax # store out[index]
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_710
+.LBB2_711:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_709: skip the unrolled loop
+.LBB2_712: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_712 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_168:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.169:
+ mov r11d, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_170
+# %bb.288:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_408
+# %bb.289:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_408
+.LBB2_170:
+ xor esi, esi
+.LBB2_717:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_719
+.LBB2_718: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_718
+.LBB2_719:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_720: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_720
+ jmp .LBB2_737
+.LBB2_171: # out[i] = in[i] + scalar over 32-bit integers
+ test r9d, r9d
+ jle .LBB2_737 # element count in r9d is <= 0 (signed): jump to .LBB2_737, the exit target used by every path here
+# %bb.172:
+ mov eax, dword ptr [rdx] # load the 32-bit scalar operand once from [rdx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_173 # fewer than 32 elements: use the scalar loops below
+# %bb.291:
+ lea rdx, [rcx + 4*r10] # rdx = one past the last input element
+ cmp rdx, r8
+ jbe .LBB2_411 # input ends at or before the output start: no overlap (target outside this view, presumably the vectorized path)
+# %bb.292:
+ lea rdx, [r8 + 4*r10] # rdx = one past the last output element
+ cmp rdx, rcx
+ jbe .LBB2_411 # output ends at or before the input start: no overlap
+.LBB2_173:
+ xor esi, esi # element index starts at 0
+.LBB2_725:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - index - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trip count of the one-element loop
+ je .LBB2_727
+.LBB2_726: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi] # load in[index]
+ add edx, eax # add the scalar
+ mov dword ptr [r8 + 4*rsi], edx # store out[index]
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_726
+.LBB2_727:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_725: skip the unrolled loop
+.LBB2_728: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB2_728 # loop until the index reaches the count
+ jmp .LBB2_737
+.LBB2_174:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.175:
+ mov eax, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_176
+# %bb.294:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_414
+# %bb.295:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_414
+.LBB2_176:
+ xor esi, esi
+.LBB2_733:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_735
+.LBB2_734: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_734
+.LBB2_735:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_736: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_736
+ jmp .LBB2_737
+.LBB2_297: # AVX2 main loop: out[i] = scalar - in[i] over 32-bit integers; uses r10d as the count and r11d as the scalar, both set by code outside this view
+ mov esi, r10d
+ and esi, -32 # rsi = count rounded down to a multiple of 32 elements
+ vmovd xmm0, r11d
+ vpbroadcastd ymm0, xmm0 # ymm0 = the scalar replicated into all eight 32-bit lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 = number of 32-element groups
+ test rdx, rdx
+ je .LBB2_417 # exactly one group: bypass the two-group loop (target outside this view)
+# %bb.298:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(group count rounded down to even); counts up to zero in steps of 2
+ xor edi, edi # element index starts at 0
+.LBB2_299: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi] # first group: four 8-lane subtractions, scalar - in
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1 # unaligned stores of the first group
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128] # second group of 32 elements
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64 # 64 elements (two groups) per iteration
+ add rdx, 2
+ jne .LBB2_299 # loop until the negated group counter reaches zero
+ jmp .LBB2_418 # continuation is outside this view (presumably handles a leftover odd group and the tail)
+.LBB2_300:  # AVX2 path: [r8] = broadcast(r11d) - [rcx], 32-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vmovd xmm0, r11d
+ vpbroadcastd ymm0, xmm0  # copy the scalar into all 8 dword lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_425  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.301:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_302: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar - source
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_302
+ jmp .LBB2_426  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_303:  # AVX2 path: [r8] = broadcast(eax) + [rcx], 32-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0  # copy the scalar into all 8 dword lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_433  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.304:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_305: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar + source
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_305
+ jmp .LBB2_434  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_306:  # AVX2 path: [r8] = broadcast(eax) + [rcx], 32-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0  # copy the scalar into all 8 dword lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_441  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.307:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_308: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar + source
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_308
+ jmp .LBB2_442  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_309:  # AVX2 path: [r8] = broadcast(xmm0) - [rcx], double-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vbroadcastsd ymm1, xmm0  # copy the low double of xmm0 into all 4 lanes
+ lea rsi, [rdx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rsi, rsi
+ je .LBB2_449  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.310:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_311: # =>This Inner Loop Header: Depth=1
+ vsubpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar - source
+ vsubpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2  # store chunk 1 results
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm5
+ vsubpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vsubpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 160]
+ vsubpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 192]
+ vsubpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 224]
+ vmovupd ymmword ptr [r8 + 8*rdi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 224], ymm5
+ add rdi, 32  # 32 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_311
+ jmp .LBB2_450  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_312:  # AVX2 path: [r8] = broadcast(xmm0) - [rcx], double-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vbroadcastsd ymm1, xmm0  # copy the low double of xmm0 into all 4 lanes
+ lea rsi, [rdx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rsi, rsi
+ je .LBB2_457  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.313:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_314: # =>This Inner Loop Header: Depth=1
+ vsubpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar - source
+ vsubpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2  # store chunk 1 results
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm5
+ vsubpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vsubpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 160]
+ vsubpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 192]
+ vsubpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 224]
+ vmovupd ymmword ptr [r8 + 8*rdi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 224], ymm5
+ add rdi, 32  # 32 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_314
+ jmp .LBB2_458  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_315:  # AVX2 path: [r8] = broadcast(xmm0) + [rcx], double-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vbroadcastsd ymm1, xmm0  # copy the low double of xmm0 into all 4 lanes
+ lea rsi, [rdx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rsi, rsi
+ je .LBB2_465  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.316:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_317: # =>This Inner Loop Header: Depth=1
+ vaddpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar + source
+ vaddpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vaddpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2  # store chunk 1 results
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm5
+ vaddpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vaddpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 160]
+ vaddpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 192]
+ vaddpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 224]
+ vmovupd ymmword ptr [r8 + 8*rdi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 224], ymm5
+ add rdi, 32  # 32 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_317
+ jmp .LBB2_466  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_318:  # AVX2 path: [r8] = broadcast(xmm0) + [rcx], double-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vbroadcastsd ymm1, xmm0  # copy the low double of xmm0 into all 4 lanes
+ lea rsi, [rdx - 16]
+ mov r9, rsi
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rsi, rsi
+ je .LBB2_473  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.319:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_320: # =>This Inner Loop Header: Depth=1
+ vaddpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar + source
+ vaddpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vaddpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2  # store chunk 1 results
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm5
+ vaddpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vaddpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 160]
+ vaddpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 192]
+ vaddpd ymm5, ymm1, ymmword ptr [rcx + 8*rdi + 224]
+ vmovupd ymmword ptr [r8 + 8*rdi + 128], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 160], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 192], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 224], ymm5
+ add rdi, 32  # 32 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_320
+ jmp .LBB2_474  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_321:  # AVX2 path: [r8] = broadcast(al) - [rcx], 8-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -128  # round down to a multiple of 128 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0  # copy the low byte into all 32 byte lanes
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1  # r9 = number of 128-element chunks
+ test rdx, rdx
+ je .LBB2_481  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.322:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element (byte) index, starts at 0
+.LBB2_323: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]  # chunk 1: scalar - source
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256  # 256 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_323
+ jmp .LBB2_482  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_324:  # AVX2 path: [r8] = broadcast(al) - [rcx], 8-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -128  # round down to a multiple of 128 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0  # copy the low byte into all 32 byte lanes
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1  # r9 = number of 128-element chunks
+ test rdx, rdx
+ je .LBB2_489  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.325:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element (byte) index, starts at 0
+.LBB2_326: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]  # chunk 1: scalar - source
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256  # 256 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_326
+ jmp .LBB2_490  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_327:  # AVX2 path: [r8] = broadcast(al) + [rcx], 8-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -128  # round down to a multiple of 128 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0  # copy the low byte into all 32 byte lanes
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1  # r9 = number of 128-element chunks
+ test rdx, rdx
+ je .LBB2_497  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.328:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element (byte) index, starts at 0
+.LBB2_329: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]  # chunk 1: scalar + source
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256  # 256 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_329
+ jmp .LBB2_498  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_330:  # AVX2 path: [r8] = broadcast(al) + [rcx], 8-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -128  # round down to a multiple of 128 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0  # copy the low byte into all 32 byte lanes
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1  # r9 = number of 128-element chunks
+ test rdx, rdx
+ je .LBB2_505  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.331:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element (byte) index, starts at 0
+.LBB2_332: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]  # chunk 1: scalar + source
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256  # 256 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_332
+ jmp .LBB2_506  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_333:  # AVX2 path: [r8] = broadcast(r11) - [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, r11
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_513  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.334:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_335: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar - source
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_335
+ jmp .LBB2_514  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_336:  # AVX2 path: [r8] = broadcast(r11) - [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, r11
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_521  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.337:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_338: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar - source
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_338
+ jmp .LBB2_522  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_339:  # AVX2 path: [r8] = broadcast(rax) + [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_529  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.340:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_341: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar + source
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_341
+ jmp .LBB2_530  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_342:  # AVX2 path: [r8] = broadcast(rax) + [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_537  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.343:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_344: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar + source
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_344
+ jmp .LBB2_538  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_345:  # AVX2 path: [r8] = broadcast(ax) - [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_545  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.346:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_347: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar - source
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_347
+ jmp .LBB2_546  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_348:  # AVX2 path: [r8] = broadcast(ax) - [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_553  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.349:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_350: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar - source
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_350
+ jmp .LBB2_554  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_351:  # AVX2 path: [r8] = broadcast(ax) - [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_561  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.352:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_353: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar - source
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_353
+ jmp .LBB2_562  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_354:  # AVX2 path: [r8] = broadcast(ax) - [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_569  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.355:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_356: # =>This Inner Loop Header: Depth=1
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar - source
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpsubw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_356
+ jmp .LBB2_570  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_357:  # AVX2 path: [r8] = broadcast(ax) + [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_577  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.358:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_359: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar + source
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_359
+ jmp .LBB2_578  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_360:  # AVX2 path: [r8] = broadcast(ax) + [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_585  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.361:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_362: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar + source
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_362
+ jmp .LBB2_586  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_363:  # AVX2 path: [r8] = broadcast(ax) + [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_593  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.364:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_365: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar + source
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_365
+ jmp .LBB2_594  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_366:  # AVX2 path: [r8] = broadcast(ax) + [rcx], 16-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -32  # round down to a multiple of 32 elements (2 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastw ymm0, xmm0  # copy the low word into all 16 word lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_601  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.367:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_368: # =>This Inner Loop Header: Depth=1
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]  # chunk 1: scalar + source
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm2
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi + 64]  # chunk 2 (unrolled copy)
+ vpaddw ymm2, ymm0, ymmword ptr [rcx + 2*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 2*rdi + 64], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 96], ymm2
+ add rdi, 64  # 64 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_368
+ jmp .LBB2_602  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_369:  # AVX2 path: [r8] = broadcast(r11) - [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, r11
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_609  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.370:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_371: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar - source
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_371
+ jmp .LBB2_610  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_372:  # AVX2 path: [r8] = broadcast(xmm0) - [rcx], single-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vbroadcastss ymm1, xmm0  # copy the low float of xmm0 into all 8 lanes
+ lea rsi, [rdx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rsi, rsi
+ je .LBB2_617  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.373:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_374: # =>This Inner Loop Header: Depth=1
+ vsubps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar - source
+ vsubps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2  # store chunk 1 results
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm5
+ vsubps ymm2, ymm1, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vsubps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 160]
+ vsubps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 192]
+ vsubps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 224]
+ vmovups ymmword ptr [r8 + 4*rdi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 224], ymm5
+ add rdi, 64  # 64 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_374
+ jmp .LBB2_618  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_375:  # AVX2 path: [r8] = broadcast(r11) - [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, r11
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_625  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.376:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_377: # =>This Inner Loop Header: Depth=1
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar - source
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpsubq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_377
+ jmp .LBB2_626  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_378:  # AVX2 path: [r8] = broadcast(xmm0) - [rcx], single-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vbroadcastss ymm1, xmm0  # copy the low float of xmm0 into all 8 lanes
+ lea rsi, [rdx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rsi, rsi
+ je .LBB2_633  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.379:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_380: # =>This Inner Loop Header: Depth=1
+ vsubps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar - source
+ vsubps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2  # store chunk 1 results
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm5
+ vsubps ymm2, ymm1, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vsubps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 160]
+ vsubps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 192]
+ vsubps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 224]
+ vmovups ymmword ptr [r8 + 4*rdi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 224], ymm5
+ add rdi, 64  # 64 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_380
+ jmp .LBB2_634  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_381:  # AVX2 path: [r8] = broadcast(rax) + [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_641  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.382:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_383: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar + source
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_383
+ jmp .LBB2_642  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_384:  # AVX2 path: [r8] = broadcast(xmm0) + [rcx], single-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vbroadcastss ymm1, xmm0  # copy the low float of xmm0 into all 8 lanes
+ lea rsi, [rdx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rsi, rsi
+ je .LBB2_649  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.385:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_386: # =>This Inner Loop Header: Depth=1
+ vaddps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar + source
+ vaddps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vaddps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2  # store chunk 1 results
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm5
+ vaddps ymm2, ymm1, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vaddps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 160]
+ vaddps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 192]
+ vaddps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 224]
+ vmovups ymmword ptr [r8 + 4*rdi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 224], ymm5
+ add rdi, 64  # 64 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_386
+ jmp .LBB2_650  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_387:  # AVX2 path: [r8] = broadcast(rax) + [rcx], 64-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -16  # round down to a multiple of 16 elements (4 ymm registers)
+ vmovq xmm0, rax
+ vpbroadcastq ymm0, xmm0  # copy the scalar into all 4 qword lanes
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1  # r9 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_657  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.388:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_389: # =>This Inner Loop Header: Depth=1
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]  # chunk 1: scalar + source
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm4
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi + 128]  # chunk 2 (unrolled copy)
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 160]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 192]
+ vpaddq ymm4, ymm0, ymmword ptr [rcx + 8*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 8*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 224], ymm4
+ add rdi, 32  # 32 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_389
+ jmp .LBB2_658  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_390:  # AVX2 path: [r8] = broadcast(xmm0) + [rcx], single-precision lanes
+ mov edx, eax  # eax: presumably the element count (set outside this view)
+ and edx, -32  # round down to a multiple of 32 elements (4 ymm registers)
+ vbroadcastss ymm1, xmm0  # copy the low float of xmm0 into all 8 lanes
+ lea rsi, [rdx - 32]
+ mov r9, rsi
+ shr r9, 5
+ add r9, 1  # r9 = number of 32-element chunks
+ test rsi, rsi
+ je .LBB2_665  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.391:
+ mov rsi, r9
+ and rsi, -2  # chunk count rounded down to even
+ neg rsi  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element index, starts at 0
+.LBB2_392: # =>This Inner Loop Header: Depth=1
+ vaddps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]  # chunk 1: scalar + source
+ vaddps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vaddps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2  # store chunk 1 results
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm5
+ vaddps ymm2, ymm1, ymmword ptr [rcx + 4*rdi + 128]  # chunk 2 (unrolled copy)
+ vaddps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 160]
+ vaddps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 192]
+ vaddps ymm5, ymm1, ymmword ptr [rcx + 4*rdi + 224]
+ vmovups ymmword ptr [r8 + 4*rdi + 128], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 160], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 192], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 224], ymm5
+ add rdi, 64  # 64 elements handled per iteration
+ add rsi, 2  # two chunks done
+ jne .LBB2_392
+ jmp .LBB2_666  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_393:  # AVX2 path: [r8] = broadcast(al) - [rcx], 8-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -128  # round down to a multiple of 128 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0  # copy the low byte into all 32 byte lanes
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1  # r9 = number of 128-element chunks
+ test rdx, rdx
+ je .LBB2_673  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.394:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element (byte) index, starts at 0
+.LBB2_395: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]  # chunk 1: scalar - source
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256  # 256 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_395
+ jmp .LBB2_674  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_396:  # AVX2 path: [r8] = broadcast(al) - [rcx], 8-bit integer lanes
+ mov esi, r10d  # r10d: presumably the element count (set outside this view)
+ and esi, -128  # round down to a multiple of 128 elements (4 ymm registers)
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0  # copy the low byte into all 32 byte lanes
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1  # r9 = number of 128-element chunks
+ test rdx, rdx
+ je .LBB2_681  # only one chunk: bypass the 2x-unrolled loop (target not in view)
+# %bb.397:
+ mov rdx, r9
+ and rdx, -2  # chunk count rounded down to even
+ neg rdx  # negated so the loop can count up to zero
+ xor edi, edi  # rdi = element (byte) index, starts at 0
+.LBB2_398: # =>This Inner Loop Header: Depth=1
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]  # chunk 1: scalar - source
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1  # store chunk 1 results
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]  # chunk 2 (unrolled copy)
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpsubb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256  # 256 elements handled per iteration
+ add rdx, 2  # two chunks done
+ jne .LBB2_398
+ jmp .LBB2_682  # target not in view; presumably leftover-chunk/tail handling
+.LBB2_399:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_689
+# %bb.400:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_401: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rdx, 2
+ jne .LBB2_401
+ jmp .LBB2_690
+.LBB2_402:
+ mov esi, r10d
+ and esi, -128
+ vmovd xmm0, eax
+ vpbroadcastb ymm0, xmm0
+ lea rdx, [rsi - 128]
+ mov r9, rdx
+ shr r9, 7
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_697
+# %bb.403:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_404: # =>This Inner Loop Header: Depth=1
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm4
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi + 128]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 160]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 192]
+ vpaddb ymm4, ymm0, ymmword ptr [rcx + rdi + 224]
+ vmovdqu ymmword ptr [r8 + rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 224], ymm4
+ add rdi, 256
+ add rdx, 2
+ jne .LBB2_404
+ jmp .LBB2_698
+.LBB2_405:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, r11d
+ vpbroadcastd ymm0, xmm0
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_705
+# %bb.406:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_407: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_407
+ jmp .LBB2_706
+.LBB2_408:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, r11d
+ vpbroadcastd ymm0, xmm0
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_713
+# %bb.409:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_410: # =>This Inner Loop Header: Depth=1
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpsubd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_410
+ jmp .LBB2_714
+.LBB2_411:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_721
+# %bb.412:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_413: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_413
+ jmp .LBB2_722
+.LBB2_414:
+ mov esi, r10d
+ and esi, -32
+ vmovd xmm0, eax
+ vpbroadcastd ymm0, xmm0
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_729
+# %bb.415:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_416: # =>This Inner Loop Header: Depth=1
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm4
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi + 128]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 160]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 192]
+ vpaddd ymm4, ymm0, ymmword ptr [rcx + 4*rdi + 224]
+ vmovdqu ymmword ptr [r8 + 4*rdi + 128], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 160], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 192], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 224], ymm4
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_416
+ jmp .LBB2_730
+.LBB2_417:
+ xor edi, edi
+.LBB2_418:
+ test r9b, 1
+ je .LBB2_420
+# %bb.419:
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_420:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_421
+.LBB2_425:
+ xor edi, edi
+.LBB2_426:
+ test r9b, 1
+ je .LBB2_428
+# %bb.427:
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_428:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_429
+.LBB2_433:
+ xor edi, edi
+.LBB2_434:
+ test r9b, 1
+ je .LBB2_436
+# %bb.435:
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_436:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_437
+.LBB2_441:
+ xor edi, edi
+.LBB2_442:
+ test r9b, 1
+ je .LBB2_444
+# %bb.443:
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_444:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_445
+.LBB2_449:
+ xor edi, edi
+.LBB2_450:
+ test r9b, 1
+ je .LBB2_452
+# %bb.451:
+ vsubpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]
+ vsubpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm1
+.LBB2_452:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_453
+.LBB2_457:
+ xor edi, edi
+.LBB2_458:
+ test r9b, 1
+ je .LBB2_460
+# %bb.459:
+ vsubpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]
+ vsubpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vsubpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vsubpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm1
+.LBB2_460:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_461
+.LBB2_465:
+ xor edi, edi
+.LBB2_466:
+ test r9b, 1
+ je .LBB2_468
+# %bb.467:
+ vaddpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]
+ vaddpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vaddpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm1
+.LBB2_468:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_469
+.LBB2_473:
+ xor edi, edi
+.LBB2_474:
+ test r9b, 1
+ je .LBB2_476
+# %bb.475:
+ vaddpd ymm2, ymm1, ymmword ptr [rcx + 8*rdi]
+ vaddpd ymm3, ymm1, ymmword ptr [rcx + 8*rdi + 32]
+ vaddpd ymm4, ymm1, ymmword ptr [rcx + 8*rdi + 64]
+ vaddpd ymm1, ymm1, ymmword ptr [rcx + 8*rdi + 96]
+ vmovupd ymmword ptr [r8 + 8*rdi], ymm2
+ vmovupd ymmword ptr [r8 + 8*rdi + 32], ymm3
+ vmovupd ymmword ptr [r8 + 8*rdi + 64], ymm4
+ vmovupd ymmword ptr [r8 + 8*rdi + 96], ymm1
+.LBB2_476:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_477
+.LBB2_481:
+ xor edi, edi
+.LBB2_482:
+ test r9b, 1
+ je .LBB2_484
+# %bb.483:
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_484:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_485
+.LBB2_489:
+ xor edi, edi
+.LBB2_490:
+ test r9b, 1
+ je .LBB2_492
+# %bb.491:
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_492:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_493
+.LBB2_497:
+ xor edi, edi
+.LBB2_498:
+ test r9b, 1
+ je .LBB2_500
+# %bb.499:
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_500:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_501
+.LBB2_505:
+ xor edi, edi
+.LBB2_506:
+ test r9b, 1
+ je .LBB2_508
+# %bb.507:
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_508:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_509
+.LBB2_513:
+ xor edi, edi
+.LBB2_514:
+ test r9b, 1
+ je .LBB2_516
+# %bb.515:
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_516:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_517
+.LBB2_521:
+ xor edi, edi
+.LBB2_522:
+ test r9b, 1
+ je .LBB2_524
+# %bb.523:
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_524:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_525
+.LBB2_529:
+ xor edi, edi
+.LBB2_530:
+ test r9b, 1
+ je .LBB2_532
+# %bb.531:
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_532:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_533
+.LBB2_537:
+ xor edi, edi
+.LBB2_538:
+ test r9b, 1
+ je .LBB2_540
+# %bb.539:
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_540:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_541
+.LBB2_545:
+ xor edi, edi
+.LBB2_546:
+ test r9b, 1
+ je .LBB2_548
+# %bb.547:
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_548:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_549
+.LBB2_553:
+ xor edi, edi
+.LBB2_554:
+ test r9b, 1
+ je .LBB2_556
+# %bb.555:
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_556:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_557
+.LBB2_561:
+ xor edi, edi
+.LBB2_562:
+ test r9b, 1
+ je .LBB2_564
+# %bb.563:
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_564:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_565
+.LBB2_569:
+ xor edi, edi
+.LBB2_570:
+ test r9b, 1
+ je .LBB2_572
+# %bb.571:
+ vpsubw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpsubw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_572:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_573
+.LBB2_577:
+ xor edi, edi
+.LBB2_578:
+ test r9b, 1
+ je .LBB2_580
+# %bb.579:
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_580:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_581
+.LBB2_585:
+ xor edi, edi
+.LBB2_586:
+ test r9b, 1
+ je .LBB2_588
+# %bb.587:
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_588:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_589
+.LBB2_593:
+ xor edi, edi
+.LBB2_594:
+ test r9b, 1
+ je .LBB2_596
+# %bb.595:
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_596:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_597
+.LBB2_601:
+ xor edi, edi
+.LBB2_602:
+ test r9b, 1
+ je .LBB2_604
+# %bb.603:
+ vpaddw ymm1, ymm0, ymmword ptr [rcx + 2*rdi]
+ vpaddw ymm0, ymm0, ymmword ptr [rcx + 2*rdi + 32]
+ vmovdqu ymmword ptr [r8 + 2*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 2*rdi + 32], ymm0
+.LBB2_604:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_605
+.LBB2_609:
+ xor edi, edi
+.LBB2_610:
+ test r9b, 1
+ je .LBB2_612
+# %bb.611:
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_612:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_613
+.LBB2_617:
+ xor edi, edi
+.LBB2_618:
+ test r9b, 1
+ je .LBB2_620
+# %bb.619:
+ vsubps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]
+ vsubps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm1
+.LBB2_620:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_621
+.LBB2_625:
+ xor edi, edi
+.LBB2_626:
+ test r9b, 1
+ je .LBB2_628
+# %bb.627:
+ vpsubq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpsubq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpsubq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpsubq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_628:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_629
+.LBB2_633:
+ xor edi, edi
+.LBB2_634:
+ test r9b, 1
+ je .LBB2_636
+# %bb.635:
+ vsubps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]
+ vsubps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vsubps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vsubps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm1
+.LBB2_636:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_637
+.LBB2_641:
+ xor edi, edi
+.LBB2_642:
+ test r9b, 1
+ je .LBB2_644
+# %bb.643:
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_644:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_645
+.LBB2_649:
+ xor edi, edi
+.LBB2_650:
+ test r9b, 1
+ je .LBB2_652
+# %bb.651:
+ vaddps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]
+ vaddps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vaddps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm1
+.LBB2_652:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_653
+.LBB2_657:
+ xor edi, edi
+.LBB2_658:
+ test r9b, 1
+ je .LBB2_660
+# %bb.659:
+ vpaddq ymm1, ymm0, ymmword ptr [rcx + 8*rdi]
+ vpaddq ymm2, ymm0, ymmword ptr [rcx + 8*rdi + 32]
+ vpaddq ymm3, ymm0, ymmword ptr [rcx + 8*rdi + 64]
+ vpaddq ymm0, ymm0, ymmword ptr [rcx + 8*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 8*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 8*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 8*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 8*rdi + 96], ymm0
+.LBB2_660:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_661
+.LBB2_665:
+ xor edi, edi
+.LBB2_666:
+ test r9b, 1
+ je .LBB2_668
+# %bb.667:
+ vaddps ymm2, ymm1, ymmword ptr [rcx + 4*rdi]
+ vaddps ymm3, ymm1, ymmword ptr [rcx + 4*rdi + 32]
+ vaddps ymm4, ymm1, ymmword ptr [rcx + 4*rdi + 64]
+ vaddps ymm1, ymm1, ymmword ptr [rcx + 4*rdi + 96]
+ vmovups ymmword ptr [r8 + 4*rdi], ymm2
+ vmovups ymmword ptr [r8 + 4*rdi + 32], ymm3
+ vmovups ymmword ptr [r8 + 4*rdi + 64], ymm4
+ vmovups ymmword ptr [r8 + 4*rdi + 96], ymm1
+.LBB2_668:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_669
+.LBB2_673:
+ xor edi, edi
+.LBB2_674:
+ test r9b, 1
+ je .LBB2_676
+# %bb.675:
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_676:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_677
+.LBB2_681:
+ xor edi, edi
+.LBB2_682:
+ test r9b, 1
+ je .LBB2_684
+# %bb.683:
+ vpsubb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpsubb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpsubb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpsubb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_684:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_685
+.LBB2_689:
+ xor edi, edi
+.LBB2_690:
+ test r9b, 1
+ je .LBB2_692
+# %bb.691:
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_692:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_693
+.LBB2_697:
+ xor edi, edi
+.LBB2_698:
+ test r9b, 1
+ je .LBB2_700
+# %bb.699:
+ vpaddb ymm1, ymm0, ymmword ptr [rcx + rdi]
+ vpaddb ymm2, ymm0, ymmword ptr [rcx + rdi + 32]
+ vpaddb ymm3, ymm0, ymmword ptr [rcx + rdi + 64]
+ vpaddb ymm0, ymm0, ymmword ptr [rcx + rdi + 96]
+ vmovdqu ymmword ptr [r8 + rdi], ymm1
+ vmovdqu ymmword ptr [r8 + rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + rdi + 96], ymm0
+.LBB2_700:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_701
+.LBB2_705:
+ xor edi, edi
+.LBB2_706:
+ test r9b, 1
+ je .LBB2_708
+# %bb.707:
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_708:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_709
+.LBB2_713:
+ xor edi, edi
+.LBB2_714:
+ test r9b, 1
+ je .LBB2_716
+# %bb.715:
+ vpsubd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpsubd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpsubd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpsubd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_716:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_717
+.LBB2_721:
+ xor edi, edi
+.LBB2_722:
+ test r9b, 1
+ je .LBB2_724
+# %bb.723:
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_724:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_725
+.LBB2_729:
+ xor edi, edi
+.LBB2_730:
+ test r9b, 1
+ je .LBB2_732
+# %bb.731:
+ vpaddd ymm1, ymm0, ymmword ptr [rcx + 4*rdi]
+ vpaddd ymm2, ymm0, ymmword ptr [rcx + 4*rdi + 32]
+ vpaddd ymm3, ymm0, ymmword ptr [rcx + 4*rdi + 64]
+ vpaddd ymm0, ymm0, ymmword ptr [rcx + 4*rdi + 96]
+ vmovdqu ymmword ptr [r8 + 4*rdi], ymm1
+ vmovdqu ymmword ptr [r8 + 4*rdi + 32], ymm2
+ vmovdqu ymmword ptr [r8 + 4*rdi + 64], ymm3
+ vmovdqu ymmword ptr [r8 + 4*rdi + 96], ymm0
+.LBB2_732:
+ cmp rsi, r10
+ jne .LBB2_733
+.LBB2_737:
+ mov rsp, rbp
+ pop rbp
+ vzeroupper
+ ret
+.Lfunc_end2:
+ .size arithmetic_scalar_arr_avx2, .Lfunc_end2-arithmetic_scalar_arr_avx2
+ # -- End function
+ .ident "Ubuntu clang version 11.1.0-6"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
diff --git a/go/arrow/compute/internal/kernels/_lib/base_arithmetic_sse4_amd64.s b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_sse4_amd64.s
new file mode 100644
index 00000000000..f1566f738e9
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/base_arithmetic_sse4_amd64.s
@@ -0,0 +1,13338 @@
+ .text
+ .intel_syntax noprefix
+ .file "base_arithmetic.cc"
+ .globl arithmetic_sse4 # -- Begin function arithmetic_sse4
+ .p2align 4, 0x90
+ .type arithmetic_sse4,@function
+arithmetic_sse4: # @arithmetic_sse4
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 1
+ jg .LBB0_10
+# %bb.1:
+ test sil, sil
+ je .LBB0_19
+# %bb.2:
+ cmp sil, 1
+ jne .LBB0_697
+# %bb.3:
+ cmp edi, 6
+ jg .LBB0_371
+# %bb.4:
+ cmp edi, 3
+ jle .LBB0_5
+# %bb.365:
+ cmp edi, 4
+ je .LBB0_412
+# %bb.366:
+ cmp edi, 5
+ je .LBB0_428
+# %bb.367:
+ cmp edi, 6
+ jne .LBB0_697
+# %bb.368:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.369:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_444
+# %bb.370:
+ xor esi, esi
+.LBB0_453:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_455
+.LBB0_454: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_454
+.LBB0_455:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_456: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_456
+ jmp .LBB0_697
+.LBB0_10:
+ cmp sil, 2
+ je .LBB0_192
+# %bb.11:
+ cmp sil, 3
+ jne .LBB0_697
+# %bb.12:
+ cmp edi, 6
+ jg .LBB0_537
+# %bb.13:
+ cmp edi, 3
+ jle .LBB0_14
+# %bb.531:
+ cmp edi, 4
+ je .LBB0_578
+# %bb.532:
+ cmp edi, 5
+ je .LBB0_594
+# %bb.533:
+ cmp edi, 6
+ jne .LBB0_697
+# %bb.534:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.535:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_610
+# %bb.536:
+ xor esi, esi
+.LBB0_619:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_621
+.LBB0_620: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_620
+.LBB0_621:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_622: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_622
+ jmp .LBB0_697
+.LBB0_19:
+ cmp edi, 6
+ jg .LBB0_32
+# %bb.20:
+ cmp edi, 3
+ jle .LBB0_21
+# %bb.26:
+ cmp edi, 4
+ je .LBB0_73
+# %bb.27:
+ cmp edi, 5
+ je .LBB0_89
+# %bb.28:
+ cmp edi, 6
+ jne .LBB0_697
+# %bb.29:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.30:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_105
+# %bb.31:
+ xor esi, esi
+.LBB0_114:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_116
+.LBB0_115: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_115
+.LBB0_116:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_117: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_117
+ jmp .LBB0_697
+.LBB0_192:
+ cmp edi, 6
+ jg .LBB0_205
+# %bb.193:
+ cmp edi, 3
+ jle .LBB0_194
+# %bb.199:
+ cmp edi, 4
+ je .LBB0_246
+# %bb.200:
+ cmp edi, 5
+ je .LBB0_262
+# %bb.201:
+ cmp edi, 6
+ jne .LBB0_697
+# %bb.202:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.203:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_278
+# %bb.204:
+ xor esi, esi
+.LBB0_287:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_289
+.LBB0_288: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_288
+.LBB0_289:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_290: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_290
+ jmp .LBB0_697
+.LBB0_371:
+ cmp edi, 8
+ jle .LBB0_372
+# %bb.377:
+ cmp edi, 9
+ je .LBB0_486
+# %bb.378:
+ cmp edi, 11
+ je .LBB0_502
+# %bb.379:
+ cmp edi, 12
+ jne .LBB0_697
+# %bb.380:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.381:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_518
+# %bb.382:
+ xor esi, esi
+.LBB0_527:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_529
+.LBB0_528: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_528
+.LBB0_529:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_530: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ movsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi + 8]
+ movsd qword ptr [r8 + 8*rsi + 8], xmm0
+ movsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi + 16]
+ movsd qword ptr [r8 + 8*rsi + 16], xmm0
+ movsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi + 24]
+ movsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_530
+ jmp .LBB0_697
+.LBB0_537:
+ cmp edi, 8
+ jle .LBB0_538
+# %bb.543:
+ cmp edi, 9
+ je .LBB0_652
+# %bb.544:
+ cmp edi, 11
+ je .LBB0_668
+# %bb.545:
+ cmp edi, 12
+ jne .LBB0_697
+# %bb.546:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.547:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_684
+# %bb.548:
+ xor esi, esi
+.LBB0_693:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_695
+.LBB0_694: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_694
+.LBB0_695:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_696: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rdx + 8*rsi] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ movsd xmm0, qword ptr [rdx + 8*rsi + 8] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi + 8]
+ movsd qword ptr [r8 + 8*rsi + 8], xmm0
+ movsd xmm0, qword ptr [rdx + 8*rsi + 16] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi + 16]
+ movsd qword ptr [r8 + 8*rsi + 16], xmm0
+ movsd xmm0, qword ptr [rdx + 8*rsi + 24] # xmm0 = mem[0],zero
+ subsd xmm0, qword ptr [rcx + 8*rsi + 24]
+ movsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_696
+ jmp .LBB0_697
+.LBB0_32:
+ cmp edi, 8
+ jle .LBB0_33
+# %bb.38:
+ cmp edi, 9
+ je .LBB0_147
+# %bb.39:
+ cmp edi, 11
+ je .LBB0_163
+# %bb.40:
+ cmp edi, 12
+ jne .LBB0_697
+# %bb.41:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.42:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_179
+# %bb.43:
+ xor esi, esi
+.LBB0_188:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_190
+.LBB0_189: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_189
+.LBB0_190:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_191: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi + 8]
+ movsd qword ptr [r8 + 8*rsi + 8], xmm0
+ movsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi + 16]
+ movsd qword ptr [r8 + 8*rsi + 16], xmm0
+ movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi + 24]
+ movsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_191
+ jmp .LBB0_697
+.LBB0_205:
+ cmp edi, 8
+ jle .LBB0_206
+# %bb.211:
+ cmp edi, 9
+ je .LBB0_320
+# %bb.212:
+ cmp edi, 11
+ je .LBB0_336
+# %bb.213:
+ cmp edi, 12
+ jne .LBB0_697
+# %bb.214:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.215:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_352
+# %bb.216:
+ xor esi, esi
+.LBB0_361:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_363
+.LBB0_362: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_362
+.LBB0_363:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_364: # =>This Inner Loop Header: Depth=1
+ movsd xmm0, qword ptr [rcx + 8*rsi] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi]
+ movsd qword ptr [r8 + 8*rsi], xmm0
+ movsd xmm0, qword ptr [rcx + 8*rsi + 8] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi + 8]
+ movsd qword ptr [r8 + 8*rsi + 8], xmm0
+ movsd xmm0, qword ptr [rcx + 8*rsi + 16] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi + 16]
+ movsd qword ptr [r8 + 8*rsi + 16], xmm0
+ movsd xmm0, qword ptr [rcx + 8*rsi + 24] # xmm0 = mem[0],zero
+ addsd xmm0, qword ptr [rdx + 8*rsi + 24]
+ movsd qword ptr [r8 + 8*rsi + 24], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_364
+ jmp .LBB0_697
+.LBB0_5:
+ cmp edi, 2
+ je .LBB0_383
+# %bb.6:
+ cmp edi, 3
+ jne .LBB0_697
+# %bb.7:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.8:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_399
+# %bb.9:
+ xor esi, esi
+.LBB0_408:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_410
+.LBB0_409: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_409
+.LBB0_410:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_411: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_411
+ jmp .LBB0_697
+.LBB0_14:
+ cmp edi, 2
+ je .LBB0_549
+# %bb.15:
+ cmp edi, 3
+ jne .LBB0_697
+# %bb.16:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.17:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_565
+# %bb.18:
+ xor esi, esi
+.LBB0_574:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_576
+.LBB0_575: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_575
+.LBB0_576:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_577: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_577
+ jmp .LBB0_697
+.LBB0_21:
+ cmp edi, 2
+ je .LBB0_44
+# %bb.22:
+ cmp edi, 3
+ jne .LBB0_697
+# %bb.23:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.24:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_60
+# %bb.25:
+ xor esi, esi
+.LBB0_69:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_71
+.LBB0_70: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_70
+.LBB0_71:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_72: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_72
+ jmp .LBB0_697
+.LBB0_194:
+ cmp edi, 2
+ je .LBB0_217
+# %bb.195:
+ cmp edi, 3
+ jne .LBB0_697
+# %bb.196:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.197:
+ mov r10d, r9d
+ cmp r9d, 32
+ jae .LBB0_233
+# %bb.198:
+ xor esi, esi
+.LBB0_242:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_244
+.LBB0_243: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_243
+.LBB0_244:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_245: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_245
+ jmp .LBB0_697
+.LBB0_372:
+ cmp edi, 7
+ je .LBB0_457
+# %bb.373:
+ cmp edi, 8
+ jne .LBB0_697
+# %bb.374:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.375:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_473
+# %bb.376:
+ xor esi, esi
+.LBB0_482:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_484
+.LBB0_483: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_483
+.LBB0_484:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_485: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_485
+ jmp .LBB0_697
+.LBB0_538:
+ cmp edi, 7
+ je .LBB0_623
+# %bb.539:
+ cmp edi, 8
+ jne .LBB0_697
+# %bb.540:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.541:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_639
+# %bb.542:
+ xor esi, esi
+.LBB0_648:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_650
+.LBB0_649: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_649
+.LBB0_650:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_651: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_651
+ jmp .LBB0_697
+.LBB0_33:
+ cmp edi, 7
+ je .LBB0_118
+# %bb.34:
+ cmp edi, 8
+ jne .LBB0_697
+# %bb.35:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.36:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_134
+# %bb.37:
+ xor esi, esi
+.LBB0_143:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_145
+.LBB0_144: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_144
+.LBB0_145:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_146: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_146
+ jmp .LBB0_697
+.LBB0_206:
+ cmp edi, 7
+ je .LBB0_291
+# %bb.207:
+ cmp edi, 8
+ jne .LBB0_697
+# %bb.208:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.209:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_307
+# %bb.210:
+ xor esi, esi
+.LBB0_316:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_318
+.LBB0_317: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_317
+.LBB0_318:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_319: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_319
+ jmp .LBB0_697
+.LBB0_412:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.413:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_415
+# %bb.414:
+ xor esi, esi
+.LBB0_424:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_426
+.LBB0_425: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_425
+.LBB0_426:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_427: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_427
+ jmp .LBB0_697
+.LBB0_428:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.429:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_431
+# %bb.430:
+ xor esi, esi
+.LBB0_440:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_442
+.LBB0_441: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_441
+.LBB0_442:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_443: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_443
+ jmp .LBB0_697
+.LBB0_578:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.579:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_581
+# %bb.580:
+ xor esi, esi
+.LBB0_590:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_592
+.LBB0_591: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_591
+.LBB0_592:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_593: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_593
+ jmp .LBB0_697
+.LBB0_594:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.595:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_597
+# %bb.596:
+ xor esi, esi
+.LBB0_606:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_608
+.LBB0_607: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_607
+.LBB0_608:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_609: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rdx + 2*rsi]
+ sub ax, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rdx + 2*rsi + 2]
+ sub ax, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rdx + 2*rsi + 4]
+ sub ax, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rdx + 2*rsi + 6]
+ sub ax, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_609
+ jmp .LBB0_697
+.LBB0_73:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.74:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_76
+# %bb.75:
+ xor esi, esi
+.LBB0_85:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_87
+.LBB0_86: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_86
+.LBB0_87:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_88: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_88
+ jmp .LBB0_697
+.LBB0_89:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.90:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_92
+# %bb.91:
+ xor esi, esi
+.LBB0_101:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_103
+.LBB0_102: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_102
+.LBB0_103:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_104: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_104
+ jmp .LBB0_697
+.LBB0_246:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.247:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_249
+# %bb.248:
+ xor esi, esi
+.LBB0_258:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_260
+.LBB0_259: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_259
+.LBB0_260:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_261: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_261
+ jmp .LBB0_697
+.LBB0_262:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.263:
+ mov r10d, r9d
+ cmp r9d, 16
+ jae .LBB0_265
+# %bb.264:
+ xor esi, esi
+.LBB0_274:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_276
+.LBB0_275: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_275
+.LBB0_276:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_277: # =>This Inner Loop Header: Depth=1
+ movzx eax, word ptr [rcx + 2*rsi]
+ add ax, word ptr [rdx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], ax
+ movzx eax, word ptr [rcx + 2*rsi + 2]
+ add ax, word ptr [rdx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], ax
+ movzx eax, word ptr [rcx + 2*rsi + 4]
+ add ax, word ptr [rdx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], ax
+ movzx eax, word ptr [rcx + 2*rsi + 6]
+ add ax, word ptr [rdx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], ax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_277
+ jmp .LBB0_697
+.LBB0_486:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.487:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_489
+# %bb.488:
+ xor esi, esi
+.LBB0_498:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_500
+.LBB0_499: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_499
+.LBB0_500:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_501: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_501
+ jmp .LBB0_697
+.LBB0_502:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.503:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_505
+# %bb.504:
+ xor esi, esi
+.LBB0_514:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_516
+.LBB0_515: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_515
+.LBB0_516:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_517: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ movss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi + 4]
+ movss dword ptr [r8 + 4*rsi + 4], xmm0
+ movss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi + 8]
+ movss dword ptr [r8 + 4*rsi + 8], xmm0
+ movss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi + 12]
+ movss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_517
+ jmp .LBB0_697
+.LBB0_652:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.653:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_655
+# %bb.654:
+ xor esi, esi
+.LBB0_664:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_666
+.LBB0_665: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_665
+.LBB0_666:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_667: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rdx + 8*rsi]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rdx + 8*rsi + 8]
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rdx + 8*rsi + 16]
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rdx + 8*rsi + 24]
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_667
+ jmp .LBB0_697
+.LBB0_668:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.669:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_671
+# %bb.670:
+ xor esi, esi
+.LBB0_680:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_682
+.LBB0_681: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_681
+.LBB0_682:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_683: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rdx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ movss xmm0, dword ptr [rdx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi + 4]
+ movss dword ptr [r8 + 4*rsi + 4], xmm0
+ movss xmm0, dword ptr [rdx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi + 8]
+ movss dword ptr [r8 + 4*rsi + 8], xmm0
+ movss xmm0, dword ptr [rdx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ subss xmm0, dword ptr [rcx + 4*rsi + 12]
+ movss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_683
+ jmp .LBB0_697
+.LBB0_147:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.148:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_150
+# %bb.149:
+ xor esi, esi
+.LBB0_159:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_161
+.LBB0_160: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_160
+.LBB0_161:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_162: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_162
+ jmp .LBB0_697
+.LBB0_163:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.164:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_166
+# %bb.165:
+ xor esi, esi
+.LBB0_175:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_177
+.LBB0_176: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_176
+.LBB0_177:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_178: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ movss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi + 4]
+ movss dword ptr [r8 + 4*rsi + 4], xmm0
+ movss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi + 8]
+ movss dword ptr [r8 + 4*rsi + 8], xmm0
+ movss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi + 12]
+ movss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_178
+ jmp .LBB0_697
+.LBB0_320:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.321:
+ mov r10d, r9d
+ cmp r9d, 4
+ jae .LBB0_323
+# %bb.322:
+ xor esi, esi
+.LBB0_332:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_334
+.LBB0_333: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_333
+.LBB0_334:
+ cmp r9, 3
+ jb .LBB0_697
+.LBB0_335: # =>This Inner Loop Header: Depth=1
+ mov rax, qword ptr [rcx + 8*rsi]
+ add rax, qword ptr [rdx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, qword ptr [rcx + 8*rsi + 8]
+ add rax, qword ptr [rdx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, qword ptr [rcx + 8*rsi + 16]
+ add rax, qword ptr [rdx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, qword ptr [rcx + 8*rsi + 24]
+ add rax, qword ptr [rdx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_335
+ jmp .LBB0_697
+.LBB0_336:
+ test r9d, r9d
+ jle .LBB0_697
+# %bb.337:
+ mov r10d, r9d
+ cmp r9d, 8
+ jae .LBB0_339
+# %bb.338:
+ xor esi, esi
+.LBB0_348:
+ mov rax, rsi
+ not rax
+ add rax, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB0_350
+.LBB0_349: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_349
+.LBB0_350:
+ cmp rax, 3
+ jb .LBB0_697
+.LBB0_351: # =>This Inner Loop Header: Depth=1
+ movss xmm0, dword ptr [rcx + 4*rsi] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi]
+ movss dword ptr [r8 + 4*rsi], xmm0
+ movss xmm0, dword ptr [rcx + 4*rsi + 4] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi + 4]
+ movss dword ptr [r8 + 4*rsi + 4], xmm0
+ movss xmm0, dword ptr [rcx + 4*rsi + 8] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi + 8]
+ movss dword ptr [r8 + 4*rsi + 8], xmm0
+ movss xmm0, dword ptr [rcx + 4*rsi + 12] # xmm0 = mem[0],zero,zero,zero
+ addss xmm0, dword ptr [rdx + 4*rsi + 12]
+ movss dword ptr [r8 + 4*rsi + 12], xmm0
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB0_351
+ jmp .LBB0_697
+.LBB0_383: # scalar 8-bit subtract: r8[i] = rdx[i] - rcx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.384:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 32
+ jae .LBB0_386 # counts >= 32 go to .LBB0_386 (outside this excerpt; presumably a vectorized path)
+# %bb.385:
+ xor esi, esi # rsi = element index = 0
+.LBB0_395:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_397 # none needed when r10 is a multiple of 4
+.LBB0_396: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_396
+.LBB0_397:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_395: finished
+.LBB0_398: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_398 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_549: # scalar 8-bit subtract: r8[i] = rdx[i] - rcx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.550:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 32
+ jae .LBB0_552 # counts >= 32 go to .LBB0_552 (outside this excerpt; presumably a vectorized path)
+# %bb.551:
+ xor esi, esi # rsi = element index = 0
+.LBB0_561:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_563 # none needed when r10 is a multiple of 4
+.LBB0_562: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_562
+.LBB0_563:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_561: finished
+.LBB0_564: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rdx + rsi]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rdx + rsi + 1]
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rdx + rsi + 2]
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rdx + rsi + 3]
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_564 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_44: # scalar 8-bit add: r8[i] = rcx[i] + rdx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.45:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 32
+ jae .LBB0_47 # counts >= 32 go to .LBB0_47 (outside this excerpt; presumably a vectorized path)
+# %bb.46:
+ xor esi, esi # rsi = element index = 0
+.LBB0_56:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_58 # none needed when r10 is a multiple of 4
+.LBB0_57: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_57
+.LBB0_58:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_56: finished
+.LBB0_59: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_59 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_217: # scalar 8-bit add: r8[i] = rcx[i] + rdx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.218:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 32
+ jae .LBB0_220 # counts >= 32 go to .LBB0_220 (outside this excerpt; presumably a vectorized path)
+# %bb.219:
+ xor esi, esi # rsi = element index = 0
+.LBB0_229:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_231 # none needed when r10 is a multiple of 4
+.LBB0_230: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_230
+.LBB0_231:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_229: finished
+.LBB0_232: # =>This Inner Loop Header: Depth=1
+ movzx eax, byte ptr [rcx + rsi]
+ add al, byte ptr [rdx + rsi]
+ mov byte ptr [r8 + rsi], al
+ movzx eax, byte ptr [rcx + rsi + 1]
+ add al, byte ptr [rdx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ movzx eax, byte ptr [rcx + rsi + 2]
+ add al, byte ptr [rdx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ movzx eax, byte ptr [rcx + rsi + 3]
+ add al, byte ptr [rdx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_232 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_457: # scalar 32-bit integer subtract: r8[i] = rdx[i] - rcx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.458:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jae .LBB0_460 # counts >= 8 go to .LBB0_460 (outside this excerpt; presumably a vectorized path)
+# %bb.459:
+ xor esi, esi # rsi = element index = 0
+.LBB0_469:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_471 # none needed when r10 is a multiple of 4
+.LBB0_470: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_470
+.LBB0_471:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_469: finished
+.LBB0_472: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_472 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_623: # scalar 32-bit integer subtract: r8[i] = rdx[i] - rcx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.624:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jae .LBB0_626 # counts >= 8 go to .LBB0_626 (outside this excerpt; presumably a vectorized path)
+# %bb.625:
+ xor esi, esi # rsi = element index = 0
+.LBB0_635:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_637 # none needed when r10 is a multiple of 4
+.LBB0_636: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_636
+.LBB0_637:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_635: finished
+.LBB0_638: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rdx + 4*rsi]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rdx + 4*rsi + 4]
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rdx + 4*rsi + 8]
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rdx + 4*rsi + 12]
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_638 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_118: # scalar 32-bit integer add: r8[i] = rcx[i] + rdx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.119:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jae .LBB0_121 # counts >= 8 go to .LBB0_121 (outside this excerpt; presumably a vectorized path)
+# %bb.120:
+ xor esi, esi # rsi = element index = 0
+.LBB0_130:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_132 # none needed when r10 is a multiple of 4
+.LBB0_131: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_131
+.LBB0_132:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_130: finished
+.LBB0_133: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_133 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_291: # scalar 32-bit integer add: r8[i] = rcx[i] + rdx[i]
+ test r9d, r9d
+ jle .LBB0_697 # leave when the signed 32-bit count in r9d is <= 0
+# %bb.292:
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jae .LBB0_294 # counts >= 8 go to .LBB0_294 (outside this excerpt; presumably a vectorized path)
+# %bb.293:
+ xor esi, esi # rsi = element index = 0
+.LBB0_303:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = r10 & 3 = passes through the one-element loop
+ je .LBB0_305 # none needed when r10 is a multiple of 4
+.LBB0_304: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB0_304
+.LBB0_305:
+ cmp r9, 3
+ jb .LBB0_697 # fewer than 4 elements were left at .LBB0_303: finished
+.LBB0_306: # =>This Inner Loop Header: Depth=1
+ mov eax, dword ptr [rcx + 4*rsi]
+ add eax, dword ptr [rdx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, dword ptr [rcx + 4*rsi + 4]
+ add eax, dword ptr [rdx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, dword ptr [rcx + 4*rsi + 8]
+ add eax, dword ptr [rdx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, dword ptr [rcx + 4*rsi + 12]
+ add eax, dword ptr [rdx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per pass
+ cmp r10, rsi
+ jne .LBB0_306 # repeat until the index reaches the count
+ jmp .LBB0_697 # .LBB0_697 is outside this excerpt
+.LBB0_444: # 32-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 4*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 4*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_453 below is taken
+ test r9b, r11b
+ jne .LBB0_453 # r8 and rdx ranges overlap (.LBB0_453 is outside this excerpt)
+# %bb.445:
+ and al, dil
+ jne .LBB0_453 # r8 and rcx ranges overlap
+# %bb.446:
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ test rax, rax
+ je .LBB0_447 # a single chunk: bypass the two-chunk loop (.LBB0_447 is outside this excerpt)
+# %bb.448:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_449: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi] # first chunk: elements rdi .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2 # rdx[i] - rcx[i], four 32-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] # second chunk: elements rdi+8 .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16 # 16 elements per pass
+ add rax, 2
+ jne .LBB0_449
+ jmp .LBB0_450 # continues at .LBB0_450 (outside this excerpt)
+.LBB0_610: # 32-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 4*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 4*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_619 below is taken
+ test r9b, r11b
+ jne .LBB0_619 # r8 and rdx ranges overlap (.LBB0_619 is outside this excerpt)
+# %bb.611:
+ and al, dil
+ jne .LBB0_619 # r8 and rcx ranges overlap
+# %bb.612:
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ test rax, rax
+ je .LBB0_613 # a single chunk: bypass the two-chunk loop (.LBB0_613 is outside this excerpt)
+# %bb.614:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_615: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi] # first chunk: elements rdi .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2 # rdx[i] - rcx[i], four 32-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] # second chunk: elements rdi+8 .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16 # 16 elements per pass
+ add rax, 2
+ jne .LBB0_615
+ jmp .LBB0_616 # continues at .LBB0_616 (outside this excerpt)
+.LBB0_105: # 32-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 4*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 4*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_114 below is taken
+ test r9b, r11b
+ jne .LBB0_114 # r8 and rdx ranges overlap (.LBB0_114 is outside this excerpt)
+# %bb.106:
+ and al, dil
+ jne .LBB0_114 # r8 and rcx ranges overlap
+# %bb.107:
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ test rax, rax
+ je .LBB0_108 # a single chunk: bypass the two-chunk loop (.LBB0_108 is outside this excerpt)
+# %bb.109:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_110: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi] # first chunk: elements rdi .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0 # rcx[i] + rdx[i], four 32-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] # second chunk: elements rdi+8 .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0
+ add rdi, 16 # 16 elements per pass
+ add rax, 2
+ jne .LBB0_110
+ jmp .LBB0_111 # continues at .LBB0_111 (outside this excerpt)
+.LBB0_278: # 32-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 4*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 4*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 4*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_287 below is taken
+ test r9b, r11b
+ jne .LBB0_287 # r8 and rdx ranges overlap (.LBB0_287 is outside this excerpt)
+# %bb.279:
+ and al, dil
+ jne .LBB0_287 # r8 and rcx ranges overlap
+# %bb.280:
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ test rax, rax
+ je .LBB0_281 # a single chunk: bypass the two-chunk loop (.LBB0_281 is outside this excerpt)
+# %bb.282:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_283: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi] # first chunk: elements rdi .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0 # rcx[i] + rdx[i], four 32-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32] # second chunk: elements rdi+8 .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0
+ add rdi, 16 # 16 elements per pass
+ add rax, 2
+ jne .LBB0_283
+ jmp .LBB0_284 # continues at .LBB0_284 (outside this excerpt)
+.LBB0_518: # float64 subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_527 below is taken
+ test r9b, r11b
+ jne .LBB0_527 # r8 and rdx ranges overlap (.LBB0_527 is outside this excerpt)
+# %bb.519:
+ and al, dil
+ jne .LBB0_527 # r8 and rcx ranges overlap
+# %bb.520:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_521 # a single chunk: bypass the two-chunk loop (.LBB0_521 is outside this excerpt)
+# %bb.522:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_523: # =>This Inner Loop Header: Depth=1
+ movupd xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ subpd xmm0, xmm2 # rdx[i] - rcx[i], two float64 lanes
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ subpd xmm1, xmm2
+ movupd xmmword ptr [r8 + 8*rdi], xmm0
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ subpd xmm0, xmm2
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ subpd xmm1, xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm0
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_523
+ jmp .LBB0_524 # continues at .LBB0_524 (outside this excerpt)
+.LBB0_684: # float64 subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_693 below is taken
+ test r9b, r11b
+ jne .LBB0_693 # r8 and rdx ranges overlap (.LBB0_693 is outside this excerpt)
+# %bb.685:
+ and al, dil
+ jne .LBB0_693 # r8 and rcx ranges overlap
+# %bb.686:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_687 # a single chunk: bypass the two-chunk loop (.LBB0_687 is outside this excerpt)
+# %bb.688:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_689: # =>This Inner Loop Header: Depth=1
+ movupd xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ subpd xmm0, xmm2 # rdx[i] - rcx[i], two float64 lanes
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ subpd xmm1, xmm2
+ movupd xmmword ptr [r8 + 8*rdi], xmm0
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ subpd xmm0, xmm2
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ subpd xmm1, xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm0
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_689
+ jmp .LBB0_690 # continues at .LBB0_690 (outside this excerpt)
+.LBB0_179: # float64 add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_188 below is taken
+ test r9b, r11b
+ jne .LBB0_188 # r8 and rdx ranges overlap (.LBB0_188 is outside this excerpt)
+# %bb.180:
+ and al, dil
+ jne .LBB0_188 # r8 and rcx ranges overlap
+# %bb.181:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_182 # a single chunk: bypass the two-chunk loop (.LBB0_182 is outside this excerpt)
+# %bb.183:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_184: # =>This Inner Loop Header: Depth=1
+ movupd xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ addpd xmm2, xmm0 # rcx[i] + rdx[i], two float64 lanes
+ movupd xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm0, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm0
+ movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ addpd xmm2, xmm0
+ movupd xmm0, xmmword ptr [rcx + 8*rdi + 48]
+ addpd xmm0, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm0
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_184
+ jmp .LBB0_185 # continues at .LBB0_185 (outside this excerpt)
+.LBB0_352: # float64 add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_361 below is taken
+ test r9b, r11b
+ jne .LBB0_361 # r8 and rdx ranges overlap (.LBB0_361 is outside this excerpt)
+# %bb.353:
+ and al, dil
+ jne .LBB0_361 # r8 and rcx ranges overlap
+# %bb.354:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_355 # a single chunk: bypass the two-chunk loop (.LBB0_355 is outside this excerpt)
+# %bb.356:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_357: # =>This Inner Loop Header: Depth=1
+ movupd xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ addpd xmm2, xmm0 # rcx[i] + rdx[i], two float64 lanes
+ movupd xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm0, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm0
+ movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ addpd xmm2, xmm0
+ movupd xmm0, xmmword ptr [rcx + 8*rdi + 48]
+ addpd xmm0, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm0
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_357
+ jmp .LBB0_358 # continues at .LBB0_358 (outside this excerpt)
+.LBB0_399: # 8-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_408 below is taken
+ test r9b, r11b
+ jne .LBB0_408 # r8 and rdx ranges overlap (.LBB0_408 is outside this excerpt)
+# %bb.400:
+ and al, dil
+ jne .LBB0_408 # r8 and rcx ranges overlap
+# %bb.401:
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ test rax, rax
+ je .LBB0_402 # a single chunk: bypass the two-chunk loop (.LBB0_402 is outside this excerpt)
+# %bb.403:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_404: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi] # first chunk: elements rdi .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2 # rdx[i] - rcx[i], sixteen 8-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32] # second chunk: elements rdi+32 .. rdi+63
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64 # 64 elements per pass
+ add rax, 2
+ jne .LBB0_404
+ jmp .LBB0_405 # continues at .LBB0_405 (outside this excerpt)
+.LBB0_565: # 8-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_574 below is taken
+ test r9b, r11b
+ jne .LBB0_574 # r8 and rdx ranges overlap (.LBB0_574 is outside this excerpt)
+# %bb.566:
+ and al, dil
+ jne .LBB0_574 # r8 and rcx ranges overlap
+# %bb.567:
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ test rax, rax
+ je .LBB0_568 # a single chunk: bypass the two-chunk loop (.LBB0_568 is outside this excerpt)
+# %bb.569:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_570: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi] # first chunk: elements rdi .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2 # rdx[i] - rcx[i], sixteen 8-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32] # second chunk: elements rdi+32 .. rdi+63
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64 # 64 elements per pass
+ add rax, 2
+ jne .LBB0_570
+ jmp .LBB0_571 # continues at .LBB0_571 (outside this excerpt)
+.LBB0_60: # 8-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_69 below is taken
+ test r9b, r11b
+ jne .LBB0_69 # r8 and rdx ranges overlap (.LBB0_69 is outside this excerpt)
+# %bb.61:
+ and al, dil
+ jne .LBB0_69 # r8 and rcx ranges overlap
+# %bb.62:
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ test rax, rax
+ je .LBB0_63 # a single chunk: bypass the two-chunk loop (.LBB0_63 is outside this excerpt)
+# %bb.64:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_65: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi] # first chunk: elements rdi .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0 # rcx[i] + rdx[i], sixteen 8-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32] # second chunk: elements rdi+32 .. rdi+63
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 48]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + rdi + 48], xmm0
+ add rdi, 64 # 64 elements per pass
+ add rax, 2
+ jne .LBB0_65
+ jmp .LBB0_66 # continues at .LBB0_66 (outside this excerpt)
+.LBB0_233: # 8-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_242 below is taken
+ test r9b, r11b
+ jne .LBB0_242 # r8 and rdx ranges overlap (.LBB0_242 is outside this excerpt)
+# %bb.234:
+ and al, dil
+ jne .LBB0_242 # r8 and rcx ranges overlap
+# %bb.235:
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ test rax, rax
+ je .LBB0_236 # a single chunk: bypass the two-chunk loop (.LBB0_236 is outside this excerpt)
+# %bb.237:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_238: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi] # first chunk: elements rdi .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0 # rcx[i] + rdx[i], sixteen 8-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32] # second chunk: elements rdi+32 .. rdi+63
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 48]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + rdi + 48], xmm0
+ add rdi, 64 # 64 elements per pass
+ add rax, 2
+ jne .LBB0_238
+ jmp .LBB0_239 # continues at .LBB0_239 (outside this excerpt)
+.LBB0_473: # 64-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_482 below is taken
+ test r9b, r11b
+ jne .LBB0_482 # r8 and rdx ranges overlap (.LBB0_482 is outside this excerpt)
+# %bb.474:
+ and al, dil
+ jne .LBB0_482 # r8 and rcx ranges overlap
+# %bb.475:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_476 # a single chunk: bypass the two-chunk loop (.LBB0_476 is outside this excerpt)
+# %bb.477:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_478: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2 # rdx[i] - rcx[i], two 64-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_478
+ jmp .LBB0_479 # continues at .LBB0_479 (outside this excerpt)
+.LBB0_639: # 64-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_648 below is taken
+ test r9b, r11b
+ jne .LBB0_648 # r8 and rdx ranges overlap (.LBB0_648 is outside this excerpt)
+# %bb.640:
+ and al, dil
+ jne .LBB0_648 # r8 and rcx ranges overlap
+# %bb.641:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_642 # a single chunk: bypass the two-chunk loop (.LBB0_642 is outside this excerpt)
+# %bb.643:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_644: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2 # rdx[i] - rcx[i], two 64-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_644
+ jmp .LBB0_645 # continues at .LBB0_645 (outside this excerpt)
+.LBB0_134: # 64-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_143 below is taken
+ test r9b, r11b
+ jne .LBB0_143 # r8 and rdx ranges overlap (.LBB0_143 is outside this excerpt)
+# %bb.135:
+ and al, dil
+ jne .LBB0_143 # r8 and rcx ranges overlap
+# %bb.136:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_137 # a single chunk: bypass the two-chunk loop (.LBB0_137 is outside this excerpt)
+# %bb.138:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_139: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0 # rcx[i] + rdx[i], two 64-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_139
+ jmp .LBB0_140 # continues at .LBB0_140 (outside this excerpt)
+.LBB0_307: # 64-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 8*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 8*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 8*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_316 below is taken
+ test r9b, r11b
+ jne .LBB0_316 # r8 and rdx ranges overlap (.LBB0_316 is outside this excerpt)
+# %bb.308:
+ and al, dil
+ jne .LBB0_316 # r8 and rcx ranges overlap
+# %bb.309:
+ mov esi, r10d
+ and esi, -4 # rsi = r10d rounded down to a multiple of 4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ test rax, rax
+ je .LBB0_310 # a single chunk: bypass the two-chunk loop (.LBB0_310 is outside this excerpt)
+# %bb.311:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_312: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi] # first chunk: elements rdi .. rdi+3
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0 # rcx[i] + rdx[i], two 64-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] # second chunk: elements rdi+4 .. rdi+7
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0
+ add rdi, 8 # 8 elements per pass
+ add rax, 2
+ jne .LBB0_312
+ jmp .LBB0_313 # continues at .LBB0_313 (outside this excerpt)
+.LBB0_415: # 16-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 2*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 2*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 2*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_424 below is taken
+ test r9b, r11b
+ jne .LBB0_424 # r8 and rdx ranges overlap (.LBB0_424 is outside this excerpt)
+# %bb.416:
+ and al, dil
+ jne .LBB0_424 # r8 and rcx ranges overlap
+# %bb.417:
+ mov esi, r10d
+ and esi, -16 # rsi = r10d rounded down to a multiple of 16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1 # r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ test rax, rax
+ je .LBB0_418 # a single chunk: bypass the two-chunk loop (.LBB0_418 is outside this excerpt)
+# %bb.419:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_420: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi] # first chunk: elements rdi .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2 # rdx[i] - rcx[i], eight 16-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] # second chunk: elements rdi+16 .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32 # 32 elements per pass
+ add rax, 2
+ jne .LBB0_420
+ jmp .LBB0_421 # continues at .LBB0_421 (outside this excerpt)
+.LBB0_431: # 16-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 2*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 2*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 2*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_440 below is taken
+ test r9b, r11b
+ jne .LBB0_440 # r8 and rdx ranges overlap (.LBB0_440 is outside this excerpt)
+# %bb.432:
+ and al, dil
+ jne .LBB0_440 # r8 and rcx ranges overlap
+# %bb.433:
+ mov esi, r10d
+ and esi, -16 # rsi = r10d rounded down to a multiple of 16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1 # r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ test rax, rax
+ je .LBB0_434 # a single chunk: bypass the two-chunk loop (.LBB0_434 is outside this excerpt)
+# %bb.435:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_436: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi] # first chunk: elements rdi .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2 # rdx[i] - rcx[i], eight 16-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] # second chunk: elements rdi+16 .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32 # 32 elements per pass
+ add rax, 2
+ jne .LBB0_436
+ jmp .LBB0_437 # continues at .LBB0_437 (outside this excerpt)
+.LBB0_581: # 16-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 2*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 2*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 2*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_590 below is taken
+ test r9b, r11b
+ jne .LBB0_590 # r8 and rdx ranges overlap (.LBB0_590 is outside this excerpt)
+# %bb.582:
+ and al, dil
+ jne .LBB0_590 # r8 and rcx ranges overlap
+# %bb.583:
+ mov esi, r10d
+ and esi, -16 # rsi = r10d rounded down to a multiple of 16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1 # r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ test rax, rax
+ je .LBB0_584 # a single chunk: bypass the two-chunk loop (.LBB0_584 is outside this excerpt)
+# %bb.585:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_586: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi] # first chunk: elements rdi .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2 # rdx[i] - rcx[i], eight 16-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] # second chunk: elements rdi+16 .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32 # 32 elements per pass
+ add rax, 2
+ jne .LBB0_586
+ jmp .LBB0_587 # continues at .LBB0_587 (outside this excerpt)
+.LBB0_597: # 16-bit integer subtract on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 2*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 2*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 2*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_606 below is taken
+ test r9b, r11b
+ jne .LBB0_606 # r8 and rdx ranges overlap (.LBB0_606 is outside this excerpt)
+# %bb.598:
+ and al, dil
+ jne .LBB0_606 # r8 and rcx ranges overlap
+# %bb.599:
+ mov esi, r10d
+ and esi, -16 # rsi = r10d rounded down to a multiple of 16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1 # r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ test rax, rax
+ je .LBB0_600 # a single chunk: bypass the two-chunk loop (.LBB0_600 is outside this excerpt)
+# %bb.601:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_602: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi] # first chunk: elements rdi .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2 # rdx[i] - rcx[i], eight 16-bit lanes
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] # second chunk: elements rdi+16 .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32 # 32 elements per pass
+ add rax, 2
+ jne .LBB0_602
+ jmp .LBB0_603 # continues at .LBB0_603 (outside this excerpt)
+.LBB0_76: # 16-bit integer add on xmm registers; begins with a runtime overlap check of r8 against rdx and rcx
+ lea rsi, [r8 + 2*r10] # rsi = end of the r8 (store) range, r10 elements long
+ lea rax, [rdx + 2*r10] # rax = end of the rdx range
+ cmp rax, r8
+ seta r9b # r9b = rdx end > r8 start
+ lea rax, [rcx + 2*r10] # rax = end of the rcx range
+ cmp rsi, rdx
+ seta r11b # r11b = r8 end > rdx start
+ cmp rax, r8
+ seta al # al = rcx end > r8 start
+ cmp rsi, rcx
+ seta dil # dil = r8 end > rcx start
+ xor esi, esi # rsi = 0 if either branch to .LBB0_85 below is taken
+ test r9b, r11b
+ jne .LBB0_85 # r8 and rdx ranges overlap (.LBB0_85 is outside this excerpt)
+# %bb.77:
+ and al, dil
+ jne .LBB0_85 # r8 and rcx ranges overlap
+# %bb.78:
+ mov esi, r10d
+ and esi, -16 # rsi = r10d rounded down to a multiple of 16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1 # r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ test rax, rax
+ je .LBB0_79 # a single chunk: bypass the two-chunk loop (.LBB0_79 is outside this excerpt)
+# %bb.80:
+ mov rax, r9
+ and rax, -2
+ neg rax # rax = -(r9 rounded down to even); stepped by 2 up to 0
+ xor edi, edi # rdi = element index
+.LBB0_81: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi] # first chunk: elements rdi .. rdi+15
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0 # rcx[i] + rdx[i], eight 16-bit lanes
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32] # second chunk: elements rdi+16 .. rdi+31
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0
+ add rdi, 32 # 32 elements per pass
+ add rax, 2
+ jne .LBB0_81
+ jmp .LBB0_82 # continues at .LBB0_82 (outside this excerpt)
+.LBB0_92:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_101
+# %bb.93:
+ and al, dil
+ jne .LBB0_101
+# %bb.94:
+ mov esi, r10d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1
+ test rax, rax
+ je .LBB0_95
+# %bb.96:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_97: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0
+ add rdi, 32
+ add rax, 2
+ jne .LBB0_97
+ jmp .LBB0_98
+.LBB0_249:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_258
+# %bb.250:
+ and al, dil
+ jne .LBB0_258
+# %bb.251:
+ mov esi, r10d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1
+ test rax, rax
+ je .LBB0_252
+# %bb.253:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_254: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0
+ add rdi, 32
+ add rax, 2
+ jne .LBB0_254
+ jmp .LBB0_255
+.LBB0_265:
+ lea rsi, [r8 + 2*r10]
+ lea rax, [rdx + 2*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 2*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_274
+# %bb.266:
+ and al, dil
+ jne .LBB0_274
+# %bb.267:
+ mov esi, r10d
+ and esi, -16
+ lea rax, [rsi - 16]
+ mov r9, rax
+ shr r9, 4
+ add r9, 1
+ test rax, rax
+ je .LBB0_268
+# %bb.269:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_270: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 32]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm0
+ add rdi, 32
+ add rax, 2
+ jne .LBB0_270
+ jmp .LBB0_271
+.LBB0_489:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_498
+# %bb.490:
+ and al, dil
+ jne .LBB0_498
+# %bb.491:
+ mov esi, r10d
+ and esi, -4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1
+ test rax, rax
+ je .LBB0_492
+# %bb.493:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_494: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8
+ add rax, 2
+ jne .LBB0_494
+ jmp .LBB0_495
+.LBB0_505:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_514
+# %bb.506:
+ and al, dil
+ jne .LBB0_514
+# %bb.507:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_508
+# %bb.509:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_510: # =>This Inner Loop Header: Depth=1
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ subps xmm0, xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ subps xmm1, xmm2
+ movups xmmword ptr [r8 + 4*rdi], xmm0
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movups xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ subps xmm0, xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ subps xmm1, xmm2
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm0
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_510
+ jmp .LBB0_511
+.LBB0_655:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_664
+# %bb.656:
+ and al, dil
+ jne .LBB0_664
+# %bb.657:
+ mov esi, r10d
+ and esi, -4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1
+ test rax, rax
+ je .LBB0_658
+# %bb.659:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_660: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8
+ add rax, 2
+ jne .LBB0_660
+ jmp .LBB0_661
+.LBB0_671:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_680
+# %bb.672:
+ and al, dil
+ jne .LBB0_680
+# %bb.673:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_674
+# %bb.675:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_676: # =>This Inner Loop Header: Depth=1
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ subps xmm0, xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ subps xmm1, xmm2
+ movups xmmword ptr [r8 + 4*rdi], xmm0
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movups xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ subps xmm0, xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ subps xmm1, xmm2
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm0
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_676
+ jmp .LBB0_677
+.LBB0_150:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_159
+# %bb.151:
+ and al, dil
+ jne .LBB0_159
+# %bb.152:
+ mov esi, r10d
+ and esi, -4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1
+ test rax, rax
+ je .LBB0_153
+# %bb.154:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_155: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0
+ add rdi, 8
+ add rax, 2
+ jne .LBB0_155
+ jmp .LBB0_156
+.LBB0_166:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_175
+# %bb.167:
+ and al, dil
+ jne .LBB0_175
+# %bb.168:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_169
+# %bb.170:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_171: # =>This Inner Loop Header: Depth=1
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ addps xmm2, xmm0
+ movups xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm0, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm0
+ movups xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ addps xmm2, xmm0
+ movups xmm0, xmmword ptr [rcx + 4*rdi + 48]
+ addps xmm0, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm0
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_171
+ jmp .LBB0_172
+.LBB0_323:
+ lea rsi, [r8 + 8*r10]
+ lea rax, [rdx + 8*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 8*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_332
+# %bb.324:
+ and al, dil
+ jne .LBB0_332
+# %bb.325:
+ mov esi, r10d
+ and esi, -4
+ lea rax, [rsi - 4]
+ mov r9, rax
+ shr r9, 2
+ add r9, 1
+ test rax, rax
+ je .LBB0_326
+# %bb.327:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_328: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm0
+ add rdi, 8
+ add rax, 2
+ jne .LBB0_328
+ jmp .LBB0_329
+.LBB0_339:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_348
+# %bb.340:
+ and al, dil
+ jne .LBB0_348
+# %bb.341:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_342
+# %bb.343:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_344: # =>This Inner Loop Header: Depth=1
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ addps xmm2, xmm0
+ movups xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm0, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm0
+ movups xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ addps xmm2, xmm0
+ movups xmm0, xmmword ptr [rcx + 4*rdi + 48]
+ addps xmm0, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm0
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_344
+ jmp .LBB0_345
+.LBB0_386:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_395
+# %bb.387:
+ and al, dil
+ jne .LBB0_395
+# %bb.388:
+ mov esi, r10d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1
+ test rax, rax
+ je .LBB0_389
+# %bb.390:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_391: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64
+ add rax, 2
+ jne .LBB0_391
+ jmp .LBB0_392
+.LBB0_552:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_561
+# %bb.553:
+ and al, dil
+ jne .LBB0_561
+# %bb.554:
+ mov esi, r10d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1
+ test rax, rax
+ je .LBB0_555
+# %bb.556:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_557: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64
+ add rax, 2
+ jne .LBB0_557
+ jmp .LBB0_558
+.LBB0_47:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_56
+# %bb.48:
+ and al, dil
+ jne .LBB0_56
+# %bb.49:
+ mov esi, r10d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1
+ test rax, rax
+ je .LBB0_50
+# %bb.51:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_52: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 48]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + rdi + 48], xmm0
+ add rdi, 64
+ add rax, 2
+ jne .LBB0_52
+ jmp .LBB0_53
+.LBB0_220:
+ lea rsi, [r8 + r10]
+ lea rax, [rdx + r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_229
+# %bb.221:
+ and al, dil
+ jne .LBB0_229
+# %bb.222:
+ mov esi, r10d
+ and esi, -32
+ lea rax, [rsi - 32]
+ mov r9, rax
+ shr r9, 5
+ add r9, 1
+ test rax, rax
+ je .LBB0_223
+# %bb.224:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_225: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 32]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 48]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + rdi + 48], xmm0
+ add rdi, 64
+ add rax, 2
+ jne .LBB0_225
+ jmp .LBB0_226
+.LBB0_460:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_469
+# %bb.461:
+ and al, dil
+ jne .LBB0_469
+# %bb.462:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_463
+# %bb.464:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_465: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_465
+ jmp .LBB0_466
+.LBB0_626:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_635
+# %bb.627:
+ and al, dil
+ jne .LBB0_635
+# %bb.628:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_629
+# %bb.630:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_631: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_631
+ jmp .LBB0_632
+.LBB0_121:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_130
+# %bb.122:
+ and al, dil
+ jne .LBB0_130
+# %bb.123:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_124
+# %bb.125:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_126: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_126
+ jmp .LBB0_127
+.LBB0_294:
+ lea rsi, [r8 + 4*r10]
+ lea rax, [rdx + 4*r10]
+ cmp rax, r8
+ seta r9b
+ lea rax, [rcx + 4*r10]
+ cmp rsi, rdx
+ seta r11b
+ cmp rax, r8
+ seta al
+ cmp rsi, rcx
+ seta dil
+ xor esi, esi
+ test r9b, r11b
+ jne .LBB0_303
+# %bb.295:
+ and al, dil
+ jne .LBB0_303
+# %bb.296:
+ mov esi, r10d
+ and esi, -8
+ lea rax, [rsi - 8]
+ mov r9, rax
+ shr r9, 3
+ add r9, 1
+ test rax, rax
+ je .LBB0_297
+# %bb.298:
+ mov rax, r9
+ and rax, -2
+ neg rax
+ xor edi, edi
+.LBB0_299: # =>This Inner Loop Header: Depth=1
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 48]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm0
+ add rdi, 16
+ add rax, 2
+ jne .LBB0_299
+ jmp .LBB0_300
+.LBB0_447:
+ xor edi, edi
+.LBB0_450:
+ test r9b, 1
+ je .LBB0_452
+# %bb.451:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB0_452:
+ cmp rsi, r10
+ jne .LBB0_453
+ jmp .LBB0_697
+.LBB0_613:
+ xor edi, edi
+.LBB0_616:
+ test r9b, 1
+ je .LBB0_618
+# %bb.617:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB0_618:
+ cmp rsi, r10
+ jne .LBB0_619
+ jmp .LBB0_697
+.LBB0_108:
+ xor edi, edi
+.LBB0_111:
+ test r9b, 1
+ je .LBB0_113
+# %bb.112:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB0_113:
+ cmp rsi, r10
+ jne .LBB0_114
+ jmp .LBB0_697
+.LBB0_281:
+ xor edi, edi
+.LBB0_284:
+ test r9b, 1
+ je .LBB0_286
+# %bb.285:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB0_286:
+ cmp rsi, r10
+ jne .LBB0_287
+ jmp .LBB0_697
+.LBB0_521:
+ xor edi, edi
+.LBB0_524:
+ test r9b, 1
+ je .LBB0_526
+# %bb.525:
+ movupd xmm0, xmmword ptr [rdx + 8*rdi]
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ subpd xmm0, xmm2
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ subpd xmm1, xmm2
+ movupd xmmword ptr [r8 + 8*rdi], xmm0
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB0_526:
+ cmp rsi, r10
+ jne .LBB0_527
+ jmp .LBB0_697
+.LBB0_687:
+ xor edi, edi
+.LBB0_690:
+ test r9b, 1
+ je .LBB0_692
+# %bb.691:
+ movupd xmm0, xmmword ptr [rdx + 8*rdi]
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ subpd xmm0, xmm2
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ subpd xmm1, xmm2
+ movupd xmmword ptr [r8 + 8*rdi], xmm0
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB0_692:
+ cmp rsi, r10
+ jne .LBB0_693
+ jmp .LBB0_697
+.LBB0_182:
+ xor edi, edi
+.LBB0_185:
+ test r9b, 1
+ je .LBB0_187
+# %bb.186:
+ movupd xmm0, xmmword ptr [rdx + 8*rdi]
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ addpd xmm2, xmm0
+ movupd xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm0, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB0_187:
+ cmp rsi, r10
+ jne .LBB0_188
+ jmp .LBB0_697
+.LBB0_355:
+ xor edi, edi
+.LBB0_358:
+ test r9b, 1
+ je .LBB0_360
+# %bb.359:
+ movupd xmm0, xmmword ptr [rdx + 8*rdi]
+ movupd xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ addpd xmm2, xmm0
+ movupd xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm0, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB0_360:
+ cmp rsi, r10
+ jne .LBB0_361
+ jmp .LBB0_697
+.LBB0_402:
+ xor edi, edi
+.LBB0_405:
+ test r9b, 1
+ je .LBB0_407
+# %bb.406:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+.LBB0_407:
+ cmp rsi, r10
+ jne .LBB0_408
+ jmp .LBB0_697
+.LBB0_568:
+ xor edi, edi
+.LBB0_571:
+ test r9b, 1
+ je .LBB0_573
+# %bb.572:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+.LBB0_573:
+ cmp rsi, r10
+ jne .LBB0_574
+ jmp .LBB0_697
+.LBB0_63:
+ xor edi, edi
+.LBB0_66:
+ test r9b, 1
+ je .LBB0_68
+# %bb.67:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB0_68:
+ cmp rsi, r10
+ jne .LBB0_69
+ jmp .LBB0_697
+.LBB0_236:
+ xor edi, edi
+.LBB0_239:
+ test r9b, 1
+ je .LBB0_241
+# %bb.240:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB0_241:
+ cmp rsi, r10
+ jne .LBB0_242
+ jmp .LBB0_697
+.LBB0_476:
+ xor edi, edi
+.LBB0_479:
+ test r9b, 1
+ je .LBB0_481
+# %bb.480:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB0_481:
+ cmp rsi, r10
+ jne .LBB0_482
+ jmp .LBB0_697
+.LBB0_642:
+ xor edi, edi
+.LBB0_645:
+ test r9b, 1
+ je .LBB0_647
+# %bb.646:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB0_647:
+ cmp rsi, r10
+ jne .LBB0_648
+ jmp .LBB0_697
+.LBB0_137:
+ xor edi, edi
+.LBB0_140:
+ test r9b, 1
+ je .LBB0_142
+# %bb.141:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB0_142:
+ cmp rsi, r10
+ jne .LBB0_143
+ jmp .LBB0_697
+.LBB0_310:
+ xor edi, edi
+.LBB0_313:
+ test r9b, 1
+ je .LBB0_315
+# %bb.314:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB0_315:
+ cmp rsi, r10
+ jne .LBB0_316
+ jmp .LBB0_697
+.LBB0_418:
+ xor edi, edi
+.LBB0_421:
+ test r9b, 1
+ je .LBB0_423
+# %bb.422:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+.LBB0_423:
+ cmp rsi, r10
+ jne .LBB0_424
+ jmp .LBB0_697
+.LBB0_434:
+ xor edi, edi
+.LBB0_437:
+ test r9b, 1
+ je .LBB0_439
+# %bb.438:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+.LBB0_439:
+ cmp rsi, r10
+ jne .LBB0_440
+ jmp .LBB0_697
+.LBB0_584:
+ xor edi, edi
+.LBB0_587:
+ test r9b, 1
+ je .LBB0_589
+# %bb.588:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+.LBB0_589:
+ cmp rsi, r10
+ jne .LBB0_590
+ jmp .LBB0_697
+.LBB0_600:
+ xor edi, edi
+.LBB0_603:
+ test r9b, 1
+ je .LBB0_605
+# %bb.604:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ psubw xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+.LBB0_605:
+ cmp rsi, r10
+ jne .LBB0_606
+ jmp .LBB0_697
+.LBB0_79:
+ xor edi, edi
+.LBB0_82:
+ test r9b, 1
+ je .LBB0_84
+# %bb.83:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB0_84:
+ cmp rsi, r10
+ jne .LBB0_85
+ jmp .LBB0_697
+.LBB0_95:
+ xor edi, edi
+.LBB0_98:
+ test r9b, 1
+ je .LBB0_100
+# %bb.99:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB0_100:
+ cmp rsi, r10
+ jne .LBB0_101
+ jmp .LBB0_697
+.LBB0_252:
+ xor edi, edi
+.LBB0_255:
+ test r9b, 1
+ je .LBB0_257
+# %bb.256:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB0_257:
+ cmp rsi, r10
+ jne .LBB0_258
+ jmp .LBB0_697
+.LBB0_268:
+ xor edi, edi
+.LBB0_271:
+ test r9b, 1
+ je .LBB0_273
+# %bb.272:
+ movdqu xmm0, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi]
+ paddw xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm0, xmm1
+ movdqu xmmword ptr [r8 + 2*rdi], xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB0_273:
+ cmp rsi, r10
+ jne .LBB0_274
+ jmp .LBB0_697
+.LBB0_492:
+ xor edi, edi
+.LBB0_495:
+ test r9b, 1
+ je .LBB0_497
+# %bb.496:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB0_497:
+ cmp rsi, r10
+ jne .LBB0_498
+ jmp .LBB0_697
+.LBB0_508:
+ xor edi, edi
+.LBB0_511:
+ test r9b, 1
+ je .LBB0_513
+# %bb.512:
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ subps xmm0, xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ subps xmm1, xmm2
+ movups xmmword ptr [r8 + 4*rdi], xmm0
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB0_513:
+ cmp rsi, r10
+ jne .LBB0_514
+ jmp .LBB0_697
+.LBB0_658:
+ xor edi, edi
+.LBB0_661:
+ test r9b, 1
+ je .LBB0_663
+# %bb.662:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ psubq xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB0_663:
+ cmp rsi, r10
+ jne .LBB0_664
+ jmp .LBB0_697
+.LBB0_674:
+ xor edi, edi
+.LBB0_677:
+ test r9b, 1
+ je .LBB0_679
+# %bb.678:
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ subps xmm0, xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ subps xmm1, xmm2
+ movups xmmword ptr [r8 + 4*rdi], xmm0
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB0_679:
+ cmp rsi, r10
+ jne .LBB0_680
+ jmp .LBB0_697
+.LBB0_153:
+ xor edi, edi
+.LBB0_156:
+ test r9b, 1
+ je .LBB0_158
+# %bb.157:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB0_158:
+ cmp rsi, r10
+ jne .LBB0_159
+ jmp .LBB0_697
+.LBB0_169:
+ xor edi, edi
+.LBB0_172:
+ test r9b, 1
+ je .LBB0_174
+# %bb.173:
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ addps xmm2, xmm0
+ movups xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm0, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB0_174:
+ cmp rsi, r10
+ jne .LBB0_175
+ jmp .LBB0_697
+.LBB0_326:
+ xor edi, edi
+.LBB0_329:
+ test r9b, 1
+ je .LBB0_331
+# %bb.330:
+ movdqu xmm0, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi]
+ paddq xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm0, xmm1
+ movdqu xmmword ptr [r8 + 8*rdi], xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB0_331:
+ cmp rsi, r10
+ jne .LBB0_332
+ jmp .LBB0_697
+.LBB0_342:
+ xor edi, edi
+.LBB0_345:
+ test r9b, 1
+ je .LBB0_347
+# %bb.346:
+ movups xmm0, xmmword ptr [rdx + 4*rdi]
+ movups xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ addps xmm2, xmm0
+ movups xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm0, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB0_347:
+ cmp rsi, r10
+ jne .LBB0_348
+ jmp .LBB0_697
+.LBB0_389:
+ xor edi, edi
+.LBB0_392:
+ test r9b, 1
+ je .LBB0_394
+# %bb.393:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+.LBB0_394:
+ cmp rsi, r10
+ jne .LBB0_395
+ jmp .LBB0_697
+.LBB0_555:
+ xor edi, edi
+.LBB0_558:
+ test r9b, 1
+ je .LBB0_560
+# %bb.559:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ psubb xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm0
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+.LBB0_560:
+ cmp rsi, r10
+ jne .LBB0_561
+ jmp .LBB0_697
+.LBB0_50:
+ xor edi, edi
+.LBB0_53:
+ test r9b, 1
+ je .LBB0_55
+# %bb.54:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB0_55:
+ cmp rsi, r10
+ jne .LBB0_56
+ jmp .LBB0_697
+.LBB0_223:
+ xor edi, edi
+.LBB0_226:
+ test r9b, 1
+ je .LBB0_228
+# %bb.227:
+ movdqu xmm0, xmmword ptr [rdx + rdi]
+ movdqu xmm1, xmmword ptr [rdx + rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + rdi]
+ paddb xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + rdi + 16]
+ paddb xmm0, xmm1
+ movdqu xmmword ptr [r8 + rdi], xmm2
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB0_228:
+ cmp rsi, r10
+ jne .LBB0_229
+ jmp .LBB0_697
+.LBB0_463:
+ xor edi, edi
+.LBB0_466:
+ test r9b, 1
+ je .LBB0_468
+# %bb.467:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB0_468:
+ cmp rsi, r10
+ jne .LBB0_469
+ jmp .LBB0_697
+.LBB0_629:
+ xor edi, edi
+.LBB0_632:
+ test r9b, 1
+ je .LBB0_634
+# %bb.633:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ psubd xmm0, xmm2
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB0_634:
+ cmp rsi, r10
+ jne .LBB0_635
+ jmp .LBB0_697
+.LBB0_124:
+ xor edi, edi
+.LBB0_127:
+ test r9b, 1
+ je .LBB0_129
+# %bb.128:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB0_129:
+ cmp rsi, r10
+ jne .LBB0_130
+ jmp .LBB0_697
+.LBB0_297:
+ xor edi, edi
+.LBB0_300:
+ test r9b, 1
+ je .LBB0_302
+# %bb.301:
+ movdqu xmm0, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi]
+ paddd xmm2, xmm0
+ movdqu xmm0, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm0, xmm1
+ movdqu xmmword ptr [r8 + 4*rdi], xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB0_302:
+ cmp rsi, r10
+ jne .LBB0_303
+.LBB0_697:
+ mov rsp, rbp
+ pop rbp
+ ret
+.Lfunc_end0:
+ .size arithmetic_sse4, .Lfunc_end0-arithmetic_sse4
+ # -- End function
+ .globl arithmetic_arr_scalar_sse4 # -- Begin function arithmetic_arr_scalar_sse4
+ .p2align 4, 0x90
+ .type arithmetic_arr_scalar_sse4,@function
+arithmetic_arr_scalar_sse4: # @arithmetic_arr_scalar_sse4
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8
+ cmp sil, 1
+ jg .LBB1_11
+# %bb.1:
+ test sil, sil
+ je .LBB1_21
+# %bb.2:
+ cmp sil, 1
+ jne .LBB1_737
+# %bb.3:
+ cmp edi, 6
+ jg .LBB1_37
+# %bb.4:
+ cmp edi, 3
+ jle .LBB1_65
+# %bb.5:
+ cmp edi, 4
+ je .LBB1_105
+# %bb.6:
+ cmp edi, 5
+ je .LBB1_108
+# %bb.7:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.8:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.9:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 8
+ jb .LBB1_10
+# %bb.177:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_297
+# %bb.178:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_297
+.LBB1_10:
+ xor esi, esi
+.LBB1_421:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_423
+.LBB1_422: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_422
+.LBB1_423:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_424: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_424
+ jmp .LBB1_737
+.LBB1_11:
+ cmp sil, 2
+ je .LBB1_29
+# %bb.12:
+ cmp sil, 3
+ jne .LBB1_737
+# %bb.13:
+ cmp edi, 6
+ jg .LBB1_44
+# %bb.14:
+ cmp edi, 3
+ jle .LBB1_70
+# %bb.15:
+ cmp edi, 4
+ je .LBB1_111
+# %bb.16:
+ cmp edi, 5
+ je .LBB1_114
+# %bb.17:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.18:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.19:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 8
+ jb .LBB1_20
+# %bb.180:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_300
+# %bb.181:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_300
+.LBB1_20:
+ xor esi, esi
+.LBB1_429:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_431
+.LBB1_430: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_430
+.LBB1_431:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_432: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_432
+ jmp .LBB1_737
+.LBB1_21:
+ cmp edi, 6
+ jg .LBB1_51
+# %bb.22:
+ cmp edi, 3
+ jle .LBB1_75
+# %bb.23:
+ cmp edi, 4
+ je .LBB1_117
+# %bb.24:
+ cmp edi, 5
+ je .LBB1_120
+# %bb.25:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.26:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.27:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 8
+ jb .LBB1_28
+# %bb.183:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_303
+# %bb.184:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_303
+.LBB1_28:
+ xor esi, esi
+.LBB1_437:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_439
+.LBB1_438: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_438
+.LBB1_439:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_440: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_440
+ jmp .LBB1_737
+.LBB1_29:
+ cmp edi, 6
+ jg .LBB1_58
+# %bb.30:
+ cmp edi, 3
+ jle .LBB1_80
+# %bb.31:
+ cmp edi, 4
+ je .LBB1_123
+# %bb.32:
+ cmp edi, 5
+ je .LBB1_126
+# %bb.33:
+ cmp edi, 6
+ jne .LBB1_737
+# %bb.34:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.35:
+ mov eax, dword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 8
+ jb .LBB1_36
+# %bb.186:
+ lea rcx, [rdx + 4*r10]
+ cmp rcx, r8
+ jbe .LBB1_306
+# %bb.187:
+ lea rcx, [r8 + 4*r10]
+ cmp rcx, rdx
+ jbe .LBB1_306
+.LBB1_36:
+ xor esi, esi
+.LBB1_445:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_447
+.LBB1_446: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_446
+.LBB1_447:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_448: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_448
+ jmp .LBB1_737
+.LBB1_37:
+ cmp edi, 8
+ jle .LBB1_85
+# %bb.38:
+ cmp edi, 9
+ je .LBB1_129
+# %bb.39:
+ cmp edi, 11
+ je .LBB1_132
+# %bb.40:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.41:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.42:
+ movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 4
+ jb .LBB1_43
+# %bb.189:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_309
+# %bb.190:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_309
+.LBB1_43:
+ xor ecx, ecx
+.LBB1_453:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_455
+.LBB1_454: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_454
+.LBB1_455:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_456: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 8], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 16], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_456
+ jmp .LBB1_737
+.LBB1_44:
+ cmp edi, 8
+ jle .LBB1_90
+# %bb.45:
+ cmp edi, 9
+ je .LBB1_135
+# %bb.46:
+ cmp edi, 11
+ je .LBB1_138
+# %bb.47:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.48:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.49:
+ movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 4
+ jb .LBB1_50
+# %bb.192:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_312
+# %bb.193:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_312
+.LBB1_50:
+ xor ecx, ecx
+.LBB1_461:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_463
+.LBB1_462: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_462
+.LBB1_463:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_464: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 8], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 16], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
+ subsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_464
+ jmp .LBB1_737
+.LBB1_51:
+ cmp edi, 8
+ jle .LBB1_95
+# %bb.52:
+ cmp edi, 9
+ je .LBB1_141
+# %bb.53:
+ cmp edi, 11
+ je .LBB1_144
+# %bb.54:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.55:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.56:
+ movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 4
+ jb .LBB1_57
+# %bb.195:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_315
+# %bb.196:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_315
+.LBB1_57:
+ xor ecx, ecx
+.LBB1_469:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_471
+.LBB1_470: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_470
+.LBB1_471:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_472: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 8], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 16], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_472
+ jmp .LBB1_737
+.LBB1_58:
+ cmp edi, 8
+ jle .LBB1_100
+# %bb.59:
+ cmp edi, 9
+ je .LBB1_147
+# %bb.60:
+ cmp edi, 11
+ je .LBB1_150
+# %bb.61:
+ cmp edi, 12
+ jne .LBB1_737
+# %bb.62:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.63:
+ movsd xmm0, qword ptr [rcx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 4
+ jb .LBB1_64
+# %bb.198:
+ lea rcx, [rdx + 8*rax]
+ cmp rcx, r8
+ jbe .LBB1_318
+# %bb.199:
+ lea rcx, [r8 + 8*rax]
+ cmp rcx, rdx
+ jbe .LBB1_318
+.LBB1_64:
+ xor ecx, ecx
+.LBB1_477:
+ mov rsi, rcx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB1_479
+.LBB1_478: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_478
+.LBB1_479:
+ cmp rsi, 3
+ jb .LBB1_737
+.LBB1_480: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rdx + 8*rcx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 8] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 8], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 16] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 16], xmm1
+ movsd xmm1, qword ptr [rdx + 8*rcx + 24] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rcx + 24], xmm1
+ add rcx, 4
+ cmp rax, rcx
+ jne .LBB1_480
+ jmp .LBB1_737
+.LBB1_65:
+ cmp edi, 2
+ je .LBB1_153
+# %bb.66:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.67:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.68:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_69
+# %bb.201:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_321
+# %bb.202:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_321
+.LBB1_69:
+ xor esi, esi
+.LBB1_485:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_487
+.LBB1_486: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_486
+.LBB1_487:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_488: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_488
+ jmp .LBB1_737
+.LBB1_70:
+ cmp edi, 2
+ je .LBB1_156
+# %bb.71:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.72:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.73:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_74
+# %bb.204:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_324
+# %bb.205:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_324
+.LBB1_74:
+ xor esi, esi
+.LBB1_493:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_495
+.LBB1_494: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_494
+.LBB1_495:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_496: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_496
+ jmp .LBB1_737
+.LBB1_75:
+ cmp edi, 2
+ je .LBB1_159
+# %bb.76:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.77:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.78:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_79
+# %bb.207:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_327
+# %bb.208:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_327
+.LBB1_79:
+ xor esi, esi
+.LBB1_501:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_503
+.LBB1_502: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_502
+.LBB1_503:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_504: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_504
+ jmp .LBB1_737
+.LBB1_80:
+ cmp edi, 2
+ je .LBB1_162
+# %bb.81:
+ cmp edi, 3
+ jne .LBB1_737
+# %bb.82:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.83:
+ mov al, byte ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB1_84
+# %bb.210:
+ lea rcx, [rdx + r10]
+ cmp rcx, r8
+ jbe .LBB1_330
+# %bb.211:
+ lea rcx, [r8 + r10]
+ cmp rcx, rdx
+ jbe .LBB1_330
+.LBB1_84:
+ xor esi, esi
+.LBB1_509:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_511
+.LBB1_510: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_510
+.LBB1_511:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_512: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_512
+ jmp .LBB1_737
+.LBB1_85:
+ cmp edi, 7
+ je .LBB1_165
+# %bb.86:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.87:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.88:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB1_89
+# %bb.213:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_333
+# %bb.214:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_333
+.LBB1_89:
+ xor esi, esi
+.LBB1_517:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_519
+.LBB1_518: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_518
+.LBB1_519:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_520: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_520
+ jmp .LBB1_737
+.LBB1_90:
+ cmp edi, 7
+ je .LBB1_168
+# %bb.91:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.92:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.93:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB1_94
+# %bb.216:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_336
+# %bb.217:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_336
+.LBB1_94:
+ xor esi, esi
+.LBB1_525:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_527
+.LBB1_526: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_526
+.LBB1_527:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_528: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_528
+ jmp .LBB1_737
+.LBB1_95:
+ cmp edi, 7
+ je .LBB1_171
+# %bb.96:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.97:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.98:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB1_99
+# %bb.219:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_339
+# %bb.220:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_339
+.LBB1_99:
+ xor esi, esi
+.LBB1_533:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_535
+.LBB1_534: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_534
+.LBB1_535:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_536: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_536
+ jmp .LBB1_737
+.LBB1_100:
+ cmp edi, 7
+ je .LBB1_174
+# %bb.101:
+ cmp edi, 8
+ jne .LBB1_737
+# %bb.102:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.103:
+ mov rax, qword ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB1_104
+# %bb.222:
+ lea rcx, [rdx + 8*r10]
+ cmp rcx, r8
+ jbe .LBB1_342
+# %bb.223:
+ lea rcx, [r8 + 8*r10]
+ cmp rcx, rdx
+ jbe .LBB1_342
+.LBB1_104:
+ xor esi, esi
+.LBB1_541:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_543
+.LBB1_542: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_542
+.LBB1_543:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_544: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_544
+ jmp .LBB1_737
+.LBB1_105:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.106:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_107
+# %bb.225:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_345
+# %bb.226:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_345
+.LBB1_107:
+ xor esi, esi
+.LBB1_549:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_551
+.LBB1_550: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_550
+.LBB1_551:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_552: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_552
+ jmp .LBB1_737
+.LBB1_108:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.109:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_110
+# %bb.228:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_348
+# %bb.229:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_348
+.LBB1_110:
+ xor esi, esi
+.LBB1_557:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_559
+.LBB1_558: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_558
+.LBB1_559:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_560: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_560
+ jmp .LBB1_737
+.LBB1_111:
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.112:
+ movzx eax, word ptr [rcx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB1_113
+# %bb.231:
+ lea rcx, [rdx + 2*r10]
+ cmp rcx, r8
+ jbe .LBB1_351
+# %bb.232:
+ lea rcx, [r8 + 2*r10]
+ cmp rcx, rdx
+ jbe .LBB1_351
+.LBB1_113:
+ xor esi, esi
+.LBB1_565:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB1_567
+.LBB1_566: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_566
+.LBB1_567:
+ cmp r9, 3
+ jb .LBB1_737
+.LBB1_568: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB1_568
+ jmp .LBB1_737
+.LBB1_114: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.115:
+ movzx eax, word ptr [rcx] # eax = 16-bit operand read once from [rcx], zero-extended
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB1_116 # fewer than 16 elements: use the scalar loops only
+# %bb.234:
+ lea rcx, [rdx + 2*r10] # rcx = one past the last word read via rdx
+ cmp rcx, r8
+ jbe .LBB1_354 # read range ends at or before the write range starts: no overlap
+# %bb.235:
+ lea rcx, [r8 + 2*r10] # rcx = one past the last word written via r8
+ cmp rcx, rdx
+ jbe .LBB1_354 # write range ends at or before the read range starts: no overlap
+.LBB1_116: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_573: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_575
+.LBB1_574: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax # element minus operand; only the low 16 bits are stored
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_574
+.LBB1_575:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_576: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ sub ecx, eax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_576 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_117: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.118:
+ movzx eax, word ptr [rcx] # eax = 16-bit operand read once from [rcx], zero-extended
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB1_119 # fewer than 16 elements: use the scalar loops only
+# %bb.237:
+ lea rcx, [rdx + 2*r10] # rcx = one past the last word read via rdx
+ cmp rcx, r8
+ jbe .LBB1_357 # read range ends at or before the write range starts: no overlap
+# %bb.238:
+ lea rcx, [r8 + 2*r10] # rcx = one past the last word written via r8
+ cmp rcx, rdx
+ jbe .LBB1_357 # write range ends at or before the read range starts: no overlap
+.LBB1_119: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_581: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_583
+.LBB1_582: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax # element plus operand (16-bit add)
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_582
+.LBB1_583:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_584: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_584 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_120: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.121:
+ movzx eax, word ptr [rcx] # eax = 16-bit operand read once from [rcx], zero-extended
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB1_122 # fewer than 16 elements: use the scalar loops only
+# %bb.240:
+ lea rcx, [rdx + 2*r10] # rcx = one past the last word read via rdx
+ cmp rcx, r8
+ jbe .LBB1_360 # read range ends at or before the write range starts: no overlap
+# %bb.241:
+ lea rcx, [r8 + 2*r10] # rcx = one past the last word written via r8
+ cmp rcx, rdx
+ jbe .LBB1_360 # write range ends at or before the read range starts: no overlap
+.LBB1_122: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_589: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_591
+.LBB1_590: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax # element plus operand (16-bit add)
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_590
+.LBB1_591:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_592: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_592 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_123: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.124:
+ movzx eax, word ptr [rcx] # eax = 16-bit operand read once from [rcx], zero-extended
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB1_125 # fewer than 16 elements: use the scalar loops only
+# %bb.243:
+ lea rcx, [rdx + 2*r10] # rcx = one past the last word read via rdx
+ cmp rcx, r8
+ jbe .LBB1_363 # read range ends at or before the write range starts: no overlap
+# %bb.244:
+ lea rcx, [r8 + 2*r10] # rcx = one past the last word written via r8
+ cmp rcx, rdx
+ jbe .LBB1_363 # write range ends at or before the read range starts: no overlap
+.LBB1_125: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_597: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_599
+.LBB1_598: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax # element plus operand (16-bit add)
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_598
+.LBB1_599:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_600: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_600 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_126: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.127:
+ movzx eax, word ptr [rcx] # eax = 16-bit operand read once from [rcx], zero-extended
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB1_128 # fewer than 16 elements: use the scalar loops only
+# %bb.246:
+ lea rcx, [rdx + 2*r10] # rcx = one past the last word read via rdx
+ cmp rcx, r8
+ jbe .LBB1_366 # read range ends at or before the write range starts: no overlap
+# %bb.247:
+ lea rcx, [r8 + 2*r10] # rcx = one past the last word written via r8
+ cmp rcx, rdx
+ jbe .LBB1_366 # write range ends at or before the read range starts: no overlap
+.LBB1_128: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_605: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_607
+.LBB1_606: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax # element plus operand (16-bit add)
+ mov word ptr [r8 + 2*rsi], cx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_606
+.LBB1_607:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_608: # =>This Inner Loop Header: Depth=1
+ movzx ecx, word ptr [rdx + 2*rsi]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 2]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 2], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 4]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 4], cx
+ movzx ecx, word ptr [rdx + 2*rsi + 6]
+ add cx, ax
+ mov word ptr [r8 + 2*rsi + 6], cx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_608 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_129: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.130:
+ mov rax, qword ptr [rcx] # rax = 64-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB1_131 # fewer than 4 elements: use the scalar loops only
+# %bb.249:
+ lea rcx, [rdx + 8*r10] # rcx = one past the last qword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_369 # read range ends at or before the write range starts: no overlap
+# %bb.250:
+ lea rcx, [r8 + 8*r10] # rcx = one past the last qword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_369 # write range ends at or before the read range starts: no overlap
+.LBB1_131: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_613: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_615
+.LBB1_614: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax # element minus operand (64-bit)
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_614
+.LBB1_615:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_616: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_616 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_132: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.133:
+ movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_134 # fewer than 8 elements: use the scalar loops only
+# %bb.252:
+ lea rcx, [rdx + 4*rax] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_372 # read range ends at or before the write range starts: no overlap
+# %bb.253:
+ lea rcx, [r8 + 4*rax] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_372 # write range ends at or before the read range starts: no overlap
+.LBB1_134: # short or overlapping buffers: start the scalar loops at index 0
+ xor ecx, ecx
+.LBB1_621: # scalar loops, starting at element index rcx
+ mov rsi, rcx
+ not rsi
+ add rsi, rax # rsi = rax - rcx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_623
+.LBB1_622: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0 # element minus operand (single precision)
+ movss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_622
+.LBB1_623:
+ cmp rsi, 3
+ jb .LBB1_737 # rsi < 3: skip the 4x-unrolled loop
+.LBB1_624: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 4], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 8], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4 # four elements per iteration
+ cmp rax, rcx
+ jne .LBB1_624 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_135: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.136:
+ mov rax, qword ptr [rcx] # rax = 64-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB1_137 # fewer than 4 elements: use the scalar loops only
+# %bb.255:
+ lea rcx, [rdx + 8*r10] # rcx = one past the last qword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_375 # read range ends at or before the write range starts: no overlap
+# %bb.256:
+ lea rcx, [r8 + 8*r10] # rcx = one past the last qword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_375 # write range ends at or before the read range starts: no overlap
+.LBB1_137: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_629: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_631
+.LBB1_630: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax # element minus operand (64-bit)
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_630
+.LBB1_631:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_632: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ sub rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_632 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_138: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.139:
+ movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_140 # fewer than 8 elements: use the scalar loops only
+# %bb.258:
+ lea rcx, [rdx + 4*rax] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_378 # read range ends at or before the write range starts: no overlap
+# %bb.259:
+ lea rcx, [r8 + 4*rax] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_378 # write range ends at or before the read range starts: no overlap
+.LBB1_140: # short or overlapping buffers: start the scalar loops at index 0
+ xor ecx, ecx
+.LBB1_637: # scalar loops, starting at element index rcx
+ mov rsi, rcx
+ not rsi
+ add rsi, rax # rsi = rax - rcx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_639
+.LBB1_638: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0 # element minus operand (single precision)
+ movss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_638
+.LBB1_639:
+ cmp rsi, 3
+ jb .LBB1_737 # rsi < 3: skip the 4x-unrolled loop
+.LBB1_640: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 4], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 8], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
+ subss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4 # four elements per iteration
+ cmp rax, rcx
+ jne .LBB1_640 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_141: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.142:
+ mov rax, qword ptr [rcx] # rax = 64-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB1_143 # fewer than 4 elements: use the scalar loops only
+# %bb.261:
+ lea rcx, [rdx + 8*r10] # rcx = one past the last qword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_381 # read range ends at or before the write range starts: no overlap
+# %bb.262:
+ lea rcx, [r8 + 8*r10] # rcx = one past the last qword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_381 # write range ends at or before the read range starts: no overlap
+.LBB1_143: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_645: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_647
+.LBB1_646: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax # element plus operand (64-bit)
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_646
+.LBB1_647:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_648: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_648 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_144: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.145:
+ movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_146 # fewer than 8 elements: use the scalar loops only
+# %bb.264:
+ lea rcx, [rdx + 4*rax] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_384 # read range ends at or before the write range starts: no overlap
+# %bb.265:
+ lea rcx, [r8 + 4*rax] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_384 # write range ends at or before the read range starts: no overlap
+.LBB1_146: # short or overlapping buffers: start the scalar loops at index 0
+ xor ecx, ecx
+.LBB1_653: # scalar loops, starting at element index rcx
+ mov rsi, rcx
+ not rsi
+ add rsi, rax # rsi = rax - rcx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_655
+.LBB1_654: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0 # element plus operand (single precision)
+ movss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_654
+.LBB1_655:
+ cmp rsi, 3
+ jb .LBB1_737 # rsi < 3: skip the 4x-unrolled loop
+.LBB1_656: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 4], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 8], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4 # four elements per iteration
+ cmp rax, rcx
+ jne .LBB1_656 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_147: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.148:
+ mov rax, qword ptr [rcx] # rax = 64-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB1_149 # fewer than 4 elements: use the scalar loops only
+# %bb.267:
+ lea rcx, [rdx + 8*r10] # rcx = one past the last qword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_387 # read range ends at or before the write range starts: no overlap
+# %bb.268:
+ lea rcx, [r8 + 8*r10] # rcx = one past the last qword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_387 # write range ends at or before the read range starts: no overlap
+.LBB1_149: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_661: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_663
+.LBB1_662: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax # element plus operand (64-bit)
+ mov qword ptr [r8 + 8*rsi], rcx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_662
+.LBB1_663:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_664: # =>This Inner Loop Header: Depth=1
+ mov rcx, qword ptr [rdx + 8*rsi]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 8]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 16]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rcx
+ mov rcx, qword ptr [rdx + 8*rsi + 24]
+ add rcx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rcx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_664 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_150: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.151:
+ movss xmm0, dword ptr [rcx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_152 # fewer than 8 elements: use the scalar loops only
+# %bb.270:
+ lea rcx, [rdx + 4*rax] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_390 # read range ends at or before the write range starts: no overlap
+# %bb.271:
+ lea rcx, [r8 + 4*rax] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_390 # write range ends at or before the read range starts: no overlap
+.LBB1_152: # short or overlapping buffers: start the scalar loops at index 0
+ xor ecx, ecx
+.LBB1_669: # scalar loops, starting at element index rcx
+ mov rsi, rcx
+ not rsi
+ add rsi, rax # rsi = rax - rcx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_671
+.LBB1_670: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0 # element plus operand (single precision)
+ movss dword ptr [r8 + 4*rcx], xmm1
+ add rcx, 1
+ add rdi, -1
+ jne .LBB1_670
+.LBB1_671:
+ cmp rsi, 3
+ jb .LBB1_737 # rsi < 3: skip the 4x-unrolled loop
+.LBB1_672: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rdx + 4*rcx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 4] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 4], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 8] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 8], xmm1
+ movss xmm1, dword ptr [rdx + 4*rcx + 12] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rcx + 12], xmm1
+ add rcx, 4 # four elements per iteration
+ cmp rax, rcx
+ jne .LBB1_672 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_153: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.154:
+ mov al, byte ptr [rcx] # al = 8-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB1_155 # fewer than 32 elements: use the scalar loops only
+# %bb.273:
+ lea rcx, [rdx + r10] # rcx = one past the last byte read via rdx
+ cmp rcx, r8
+ jbe .LBB1_393 # read range ends at or before the write range starts: no overlap
+# %bb.274:
+ lea rcx, [r8 + r10] # rcx = one past the last byte written via r8
+ cmp rcx, rdx
+ jbe .LBB1_393 # write range ends at or before the read range starts: no overlap
+.LBB1_155: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_677: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_679
+.LBB1_678: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al # element minus operand (8-bit)
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_678
+.LBB1_679:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_680: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_680 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_156: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.157:
+ mov al, byte ptr [rcx] # al = 8-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB1_158 # fewer than 32 elements: use the scalar loops only
+# %bb.276:
+ lea rcx, [rdx + r10] # rcx = one past the last byte read via rdx
+ cmp rcx, r8
+ jbe .LBB1_396 # read range ends at or before the write range starts: no overlap
+# %bb.277:
+ lea rcx, [r8 + r10] # rcx = one past the last byte written via r8
+ cmp rcx, rdx
+ jbe .LBB1_396 # write range ends at or before the read range starts: no overlap
+.LBB1_158: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_685: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_687
+.LBB1_686: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al # element minus operand (8-bit)
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_686
+.LBB1_687:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_688: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ sub cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ sub cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_688 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_159: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.160:
+ mov al, byte ptr [rcx] # al = 8-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB1_161 # fewer than 32 elements: use the scalar loops only
+# %bb.279:
+ lea rcx, [rdx + r10] # rcx = one past the last byte read via rdx
+ cmp rcx, r8
+ jbe .LBB1_399 # read range ends at or before the write range starts: no overlap
+# %bb.280:
+ lea rcx, [r8 + r10] # rcx = one past the last byte written via r8
+ cmp rcx, rdx
+ jbe .LBB1_399 # write range ends at or before the read range starts: no overlap
+.LBB1_161: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_693: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_695
+.LBB1_694: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al # element plus operand (8-bit)
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_694
+.LBB1_695:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_696: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_696 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_162: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.163:
+ mov al, byte ptr [rcx] # al = 8-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB1_164 # fewer than 32 elements: use the scalar loops only
+# %bb.282:
+ lea rcx, [rdx + r10] # rcx = one past the last byte read via rdx
+ cmp rcx, r8
+ jbe .LBB1_402 # read range ends at or before the write range starts: no overlap
+# %bb.283:
+ lea rcx, [r8 + r10] # rcx = one past the last byte written via r8
+ cmp rcx, rdx
+ jbe .LBB1_402 # write range ends at or before the read range starts: no overlap
+.LBB1_164: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_701: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_703
+.LBB1_702: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al # element plus operand (8-bit)
+ mov byte ptr [r8 + rsi], cl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_702
+.LBB1_703:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_704: # =>This Inner Loop Header: Depth=1
+ movzx ecx, byte ptr [rdx + rsi]
+ add cl, al
+ mov byte ptr [r8 + rsi], cl
+ movzx ecx, byte ptr [rdx + rsi + 1]
+ add cl, al
+ mov byte ptr [r8 + rsi + 1], cl
+ movzx ecx, byte ptr [rdx + rsi + 2]
+ add cl, al
+ mov byte ptr [r8 + rsi + 2], cl
+ movzx ecx, byte ptr [rdx + rsi + 3]
+ add cl, al
+ mov byte ptr [r8 + rsi + 3], cl
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_704 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_165: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.166:
+ mov eax, dword ptr [rcx] # eax = 32-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_167 # fewer than 8 elements: use the scalar loops only
+# %bb.285:
+ lea rcx, [rdx + 4*r10] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_405 # read range ends at or before the write range starts: no overlap
+# %bb.286:
+ lea rcx, [r8 + 4*r10] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_405 # write range ends at or before the read range starts: no overlap
+.LBB1_167: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_709: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_711
+.LBB1_710: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax # element minus operand (32-bit)
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_710
+.LBB1_711:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_712: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_712 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_168: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.169:
+ mov eax, dword ptr [rcx] # eax = 32-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_170 # fewer than 8 elements: use the scalar loops only
+# %bb.288:
+ lea rcx, [rdx + 4*r10] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_408 # read range ends at or before the write range starts: no overlap
+# %bb.289:
+ lea rcx, [r8 + 4*r10] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_408 # write range ends at or before the read range starts: no overlap
+.LBB1_170: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_717: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_719
+.LBB1_718: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax # element minus operand (32-bit)
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_718
+.LBB1_719:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_720: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ sub ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_720 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_171: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.172:
+ mov eax, dword ptr [rcx] # eax = 32-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_173 # fewer than 8 elements: use the scalar loops only
+# %bb.291:
+ lea rcx, [rdx + 4*r10] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_411 # read range ends at or before the write range starts: no overlap
+# %bb.292:
+ lea rcx, [r8 + 4*r10] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_411 # write range ends at or before the read range starts: no overlap
+.LBB1_173: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_725: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_727
+.LBB1_726: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax # element plus operand (32-bit)
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_726
+.LBB1_727:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_728: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_728 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_174: # case entry: jump to .LBB1_737 unless the signed 32-bit count in r9d is > 0
+ test r9d, r9d
+ jle .LBB1_737
+# %bb.175:
+ mov eax, dword ptr [rcx] # eax = 32-bit operand read once from [rcx]
+ mov r10d, r9d # r10 = element count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB1_176 # fewer than 8 elements: use the scalar loops only
+# %bb.294:
+ lea rcx, [rdx + 4*r10] # rcx = one past the last dword read via rdx
+ cmp rcx, r8
+ jbe .LBB1_414 # read range ends at or before the write range starts: no overlap
+# %bb.295:
+ lea rcx, [r8 + 4*r10] # rcx = one past the last dword written via r8
+ cmp rcx, rdx
+ jbe .LBB1_414 # write range ends at or before the read range starts: no overlap
+.LBB1_176: # short or overlapping buffers: start the scalar loops at index 0
+ xor esi, esi
+.LBB1_733: # scalar loops, starting at element index rsi
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = r10 - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element iterations
+ je .LBB1_735
+.LBB1_734: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax # element plus operand (32-bit)
+ mov dword ptr [r8 + 4*rsi], ecx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB1_734
+.LBB1_735:
+ cmp r9, 3
+ jb .LBB1_737 # r9 < 3: skip the 4x-unrolled loop
+.LBB1_736: # =>This Inner Loop Header: Depth=1
+ mov ecx, dword ptr [rdx + 4*rsi]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 4]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 4], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 8]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 8], ecx
+ mov ecx, dword ptr [rdx + 4*rsi + 12]
+ add ecx, eax
+ mov dword ptr [r8 + 4*rsi + 12], ecx
+ add rsi, 4 # four elements per iteration
+ cmp r10, rsi
+ jne .LBB1_736 # loop until the index reaches the count
+ jmp .LBB1_737
+.LBB1_297: # SSE loop setup: subtract the 32-bit value in eax from packed dwords
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ movd xmm0, eax # with the pshufd below: broadcast eax to all four dword lanes
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 steps of 8 elements each
+ test rcx, rcx
+ je .LBB1_417 # rsi == 8: a single step, so the 2x-unrolled loop is bypassed
+# %bb.298:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_299: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16 # 16 dwords (64 bytes) per iteration
+ add rcx, 2
+ jne .LBB1_299
+ jmp .LBB1_418
+.LBB1_300: # SSE loop setup: subtract the 32-bit value in eax from packed dwords
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ movd xmm0, eax # with the pshufd below: broadcast eax to all four dword lanes
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 steps of 8 elements each
+ test rcx, rcx
+ je .LBB1_425 # rsi == 8: a single step, so the 2x-unrolled loop is bypassed
+# %bb.301:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_302: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16 # 16 dwords (64 bytes) per iteration
+ add rcx, 2
+ jne .LBB1_302
+ jmp .LBB1_426
+.LBB1_303: # SSE loop setup: add the 32-bit value in eax to packed dwords
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ movd xmm0, eax # with the pshufd below: broadcast eax to all four dword lanes
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 steps of 8 elements each
+ test rcx, rcx
+ je .LBB1_433 # rsi == 8: a single step, so the 2x-unrolled loop is bypassed
+# %bb.304:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_305: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16 # 16 dwords (64 bytes) per iteration
+ add rcx, 2
+ jne .LBB1_305
+ jmp .LBB1_434
+.LBB1_306: # SSE loop setup: add the 32-bit value in eax to packed dwords
+ mov esi, r10d
+ and esi, -8 # rsi = r10d rounded down to a multiple of 8
+ movd xmm0, eax # with the pshufd below: broadcast eax to all four dword lanes
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8) / 8 + 1 steps of 8 elements each
+ test rcx, rcx
+ je .LBB1_441 # rsi == 8: a single step, so the 2x-unrolled loop is bypassed
+# %bb.307:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_308: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16 # 16 dwords (64 bytes) per iteration
+ add rcx, 2
+ jne .LBB1_308
+ jmp .LBB1_442
+.LBB1_309: # SSE loop setup: subtract the double in xmm0's low lane from packed doubles
+ mov ecx, eax
+ and ecx, -4 # rcx = eax rounded down to a multiple of 4
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rcx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rcx - 4) / 4 + 1 steps of 4 elements each
+ test rsi, rsi
+ je .LBB1_449 # rcx == 4: a single step, so the 2x-unrolled loop is bypassed
+# %bb.310:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_311: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ subpd xmm2, xmm1
+ subpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+ movupd xmm2, xmmword ptr [rdx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 48]
+ subpd xmm2, xmm1
+ subpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm3
+ add rdi, 8 # 8 doubles (64 bytes) per iteration
+ add rsi, 2
+ jne .LBB1_311
+ jmp .LBB1_450
+.LBB1_312: # SSE loop setup: subtract the double in xmm0's low lane from packed doubles
+ mov ecx, eax
+ and ecx, -4 # rcx = eax rounded down to a multiple of 4
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rcx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rcx - 4) / 4 + 1 steps of 4 elements each
+ test rsi, rsi
+ je .LBB1_457 # rcx == 4: a single step, so the 2x-unrolled loop is bypassed
+# %bb.313:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_314: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ subpd xmm2, xmm1
+ subpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+ movupd xmm2, xmmword ptr [rdx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 48]
+ subpd xmm2, xmm1
+ subpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm3
+ add rdi, 8 # 8 doubles (64 bytes) per iteration
+ add rsi, 2
+ jne .LBB1_314
+ jmp .LBB1_458
+.LBB1_315: # SSE loop setup: add the double in xmm0's low lane to packed doubles
+ mov ecx, eax
+ and ecx, -4 # rcx = eax rounded down to a multiple of 4
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rcx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rcx - 4) / 4 + 1 steps of 4 elements each
+ test rsi, rsi
+ je .LBB1_465 # rcx == 4: a single step, so the 2x-unrolled loop is bypassed
+# %bb.316:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_317: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+ movupd xmm2, xmmword ptr [rdx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 48]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm3
+ add rdi, 8 # 8 doubles (64 bytes) per iteration
+ add rsi, 2
+ jne .LBB1_317
+ jmp .LBB1_466
+.LBB1_318: # SSE loop setup: add the double in xmm0's low lane to packed doubles
+ mov ecx, eax
+ and ecx, -4 # rcx = eax rounded down to a multiple of 4
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rcx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rcx - 4) / 4 + 1 steps of 4 elements each
+ test rsi, rsi
+ je .LBB1_473 # rcx == 4: a single step, so the 2x-unrolled loop is bypassed
+# %bb.319:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_320: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+ movupd xmm2, xmmword ptr [rdx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 48]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm3
+ add rdi, 8 # 8 doubles (64 bytes) per iteration
+ add rsi, 2
+ jne .LBB1_320
+ jmp .LBB1_474
+.LBB1_321: # SSE loop setup: subtract the byte in al from packed bytes
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every byte lane of xmm0 = its byte 0 (the value from al)
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 steps of 32 elements each
+ test rcx, rcx
+ je .LBB1_481 # rsi == 32: a single step, so the 2x-unrolled loop is bypassed
+# %bb.322:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_323: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64 # 64 bytes per iteration
+ add rcx, 2
+ jne .LBB1_323
+ jmp .LBB1_482
+.LBB1_324: # SSE loop setup: subtract the byte in al from packed bytes
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every byte lane of xmm0 = its byte 0 (the value from al)
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 steps of 32 elements each
+ test rcx, rcx
+ je .LBB1_489 # rsi == 32: a single step, so the 2x-unrolled loop is bypassed
+# %bb.325:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_326: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64 # 64 bytes per iteration
+ add rcx, 2
+ jne .LBB1_326
+ jmp .LBB1_490
+.LBB1_327: # SSE loop setup: add the byte in al to packed bytes
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every byte lane of xmm0 = its byte 0 (the value from al)
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 steps of 32 elements each
+ test rcx, rcx
+ je .LBB1_497 # rsi == 32: a single step, so the 2x-unrolled loop is bypassed
+# %bb.328:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_329: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64 # 64 bytes per iteration
+ add rcx, 2
+ jne .LBB1_329
+ jmp .LBB1_498
+.LBB1_330: # SSE loop setup: add the byte in al to packed bytes
+ mov esi, r10d
+ and esi, -32 # rsi = r10d rounded down to a multiple of 32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every byte lane of xmm0 = its byte 0 (the value from al)
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32) / 32 + 1 steps of 32 elements each
+ test rcx, rcx
+ je .LBB1_505 # rsi == 32: a single step, so the 2x-unrolled loop is bypassed
+# %bb.331:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx # rcx = -(r9 rounded down to even); counts up to zero by 2
+ xor edi, edi # rdi = element index
+.LBB1_332: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64 # 64 bytes per iteration
+ add rcx, 2
+ jne .LBB1_332
+ jmp .LBB1_506
+.LBB1_333:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_513
+# %bb.334:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_335: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_335
+ jmp .LBB1_514
+.LBB1_336:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_521
+# %bb.337:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_338: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_338
+ jmp .LBB1_522
+.LBB1_339:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_529
+# %bb.340:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_341: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_341
+ jmp .LBB1_530
+.LBB1_342:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_537
+# %bb.343:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_344: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_344
+ jmp .LBB1_538
+.LBB1_345:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_545
+# %bb.346:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_347: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_347
+ jmp .LBB1_546
+.LBB1_348:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_553
+# %bb.349:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_350: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_350
+ jmp .LBB1_554
+.LBB1_351:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_561
+# %bb.352:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_353: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_353
+ jmp .LBB1_562
+.LBB1_354:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_569
+# %bb.355:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_356: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_356
+ jmp .LBB1_570
+.LBB1_357:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_577
+# %bb.358:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_359: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_359
+ jmp .LBB1_578
+.LBB1_360:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_585
+# %bb.361:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_362: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_362
+ jmp .LBB1_586
+.LBB1_363:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_593
+# %bb.364:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_365: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_365
+ jmp .LBB1_594
+.LBB1_366:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 16]
+ mov r9, rcx
+ shr r9, 4
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_601
+# %bb.367:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_368: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rcx, 2
+ jne .LBB1_368
+ jmp .LBB1_602
+.LBB1_369:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_609
+# %bb.370:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_371: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_371
+ jmp .LBB1_610
+.LBB1_372:
+ mov ecx, eax
+ and ecx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rcx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_617
+# %bb.373:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_374: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ subps xmm2, xmm1
+ subps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+ movups xmm2, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 48]
+ subps xmm2, xmm1
+ subps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm3
+ add rdi, 16
+ add rsi, 2
+ jne .LBB1_374
+ jmp .LBB1_618
+.LBB1_375:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_625
+# %bb.376:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_377: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_377
+ jmp .LBB1_626
+.LBB1_378:
+ mov ecx, eax
+ and ecx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rcx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_633
+# %bb.379:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_380: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ subps xmm2, xmm1
+ subps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+ movups xmm2, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 48]
+ subps xmm2, xmm1
+ subps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm3
+ add rdi, 16
+ add rsi, 2
+ jne .LBB1_380
+ jmp .LBB1_634
+.LBB1_381:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_641
+# %bb.382:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_383: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_383
+ jmp .LBB1_642
+.LBB1_384:
+ mov ecx, eax
+ and ecx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rcx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_649
+# %bb.385:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_386: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+ movups xmm2, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 48]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm3
+ add rdi, 16
+ add rsi, 2
+ jne .LBB1_386
+ jmp .LBB1_650
+.LBB1_387:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rcx, [rsi - 4]
+ mov r9, rcx
+ shr r9, 2
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_657
+# %bb.388:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_389: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rcx, 2
+ jne .LBB1_389
+ jmp .LBB1_658
+.LBB1_390:
+ mov ecx, eax
+ and ecx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rcx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB1_665
+# %bb.391:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB1_392: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+ movups xmm2, xmmword ptr [rdx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 48]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm3
+ add rdi, 16
+ add rsi, 2
+ jne .LBB1_392
+ jmp .LBB1_666
+.LBB1_393:
+ mov esi, r10d
+ and esi, -32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_673
+# %bb.394:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_395: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_395
+ jmp .LBB1_674
+.LBB1_396:
+ mov esi, r10d
+ and esi, -32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_681
+# %bb.397:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_398: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_398
+ jmp .LBB1_682
+.LBB1_399:
+ mov esi, r10d
+ and esi, -32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_689
+# %bb.400:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_401: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_401
+ jmp .LBB1_690
+.LBB1_402:
+ mov esi, r10d
+ and esi, -32
+ movzx ecx, al
+ movd xmm0, ecx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rcx, [rsi - 32]
+ mov r9, rcx
+ shr r9, 5
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_697
+# %bb.403:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_404: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64
+ add rcx, 2
+ jne .LBB1_404
+ jmp .LBB1_698
+.LBB1_405:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_705
+# %bb.406:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_407: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rcx, 2
+ jne .LBB1_407
+ jmp .LBB1_706
+.LBB1_408:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_713
+# %bb.409:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_410: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rcx, 2
+ jne .LBB1_410
+ jmp .LBB1_714
+.LBB1_411:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_721
+# %bb.412:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_413: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rcx, 2
+ jne .LBB1_413
+ jmp .LBB1_722
+.LBB1_414:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rcx, [rsi - 8]
+ mov r9, rcx
+ shr r9, 3
+ add r9, 1
+ test rcx, rcx
+ je .LBB1_729
+# %bb.415:
+ mov rcx, r9
+ and rcx, -2
+ neg rcx
+ xor edi, edi
+.LBB1_416: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rcx, 2
+ jne .LBB1_416
+ jmp .LBB1_730
+.LBB1_417:
+ xor edi, edi
+.LBB1_418:
+ test r9b, 1
+ je .LBB1_420
+# %bb.419:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_420:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_421
+.LBB1_425:
+ xor edi, edi
+.LBB1_426:
+ test r9b, 1
+ je .LBB1_428
+# %bb.427:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_428:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_429
+.LBB1_433:
+ xor edi, edi
+.LBB1_434:
+ test r9b, 1
+ je .LBB1_436
+# %bb.435:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_436:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_437
+.LBB1_441:
+ xor edi, edi
+.LBB1_442:
+ test r9b, 1
+ je .LBB1_444
+# %bb.443:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_444:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_445
+.LBB1_449:
+ xor edi, edi
+.LBB1_450:
+ test r9b, 1
+ je .LBB1_452
+# %bb.451:
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ subpd xmm2, xmm1
+ subpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+.LBB1_452:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_453
+.LBB1_457:
+ xor edi, edi
+.LBB1_458:
+ test r9b, 1
+ je .LBB1_460
+# %bb.459:
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ subpd xmm2, xmm1
+ subpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+.LBB1_460:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_461
+.LBB1_465:
+ xor edi, edi
+.LBB1_466:
+ test r9b, 1
+ je .LBB1_468
+# %bb.467:
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+.LBB1_468:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_469
+.LBB1_473:
+ xor edi, edi
+.LBB1_474:
+ test r9b, 1
+ je .LBB1_476
+# %bb.475:
+ movupd xmm2, xmmword ptr [rdx + 8*rdi]
+ movupd xmm3, xmmword ptr [rdx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+.LBB1_476:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_477
+.LBB1_481:
+ xor edi, edi
+.LBB1_482:
+ test r9b, 1
+ je .LBB1_484
+# %bb.483:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_484:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_485
+.LBB1_489:
+ xor edi, edi
+.LBB1_490:
+ test r9b, 1
+ je .LBB1_492
+# %bb.491:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_492:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_493
+.LBB1_497:
+ xor edi, edi
+.LBB1_498:
+ test r9b, 1
+ je .LBB1_500
+# %bb.499:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_500:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_501
+.LBB1_505:
+ xor edi, edi
+.LBB1_506:
+ test r9b, 1
+ je .LBB1_508
+# %bb.507:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_508:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_509
+.LBB1_513:
+ xor edi, edi
+.LBB1_514:
+ test r9b, 1
+ je .LBB1_516
+# %bb.515:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_516:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_517
+.LBB1_521:
+ xor edi, edi
+.LBB1_522:
+ test r9b, 1
+ je .LBB1_524
+# %bb.523:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_524:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_525
+.LBB1_529:
+ xor edi, edi
+.LBB1_530:
+ test r9b, 1
+ je .LBB1_532
+# %bb.531:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_532:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_533
+.LBB1_537:
+ xor edi, edi
+.LBB1_538:
+ test r9b, 1
+ je .LBB1_540
+# %bb.539:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_540:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_541
+.LBB1_545:
+ xor edi, edi
+.LBB1_546:
+ test r9b, 1
+ je .LBB1_548
+# %bb.547:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_548:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_549
+.LBB1_553:
+ xor edi, edi
+.LBB1_554:
+ test r9b, 1
+ je .LBB1_556
+# %bb.555:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_556:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_557
+.LBB1_561:
+ xor edi, edi
+.LBB1_562:
+ test r9b, 1
+ je .LBB1_564
+# %bb.563:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_564:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_565
+.LBB1_569:
+ xor edi, edi
+.LBB1_570:
+ test r9b, 1
+ je .LBB1_572
+# %bb.571:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ psubw xmm1, xmm0
+ psubw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_572:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_573
+.LBB1_577:
+ xor edi, edi
+.LBB1_578:
+ test r9b, 1
+ je .LBB1_580
+# %bb.579:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_580:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_581
+.LBB1_585:
+ xor edi, edi
+.LBB1_586:
+ test r9b, 1
+ je .LBB1_588
+# %bb.587:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_588:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_589
+.LBB1_593:
+ xor edi, edi
+.LBB1_594:
+ test r9b, 1
+ je .LBB1_596
+# %bb.595:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_596:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_597
+.LBB1_601:
+ xor edi, edi
+.LBB1_602:
+ test r9b, 1
+ je .LBB1_604
+# %bb.603:
+ movdqu xmm1, xmmword ptr [rdx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB1_604:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_605
+.LBB1_609:
+ xor edi, edi
+.LBB1_610:
+ test r9b, 1
+ je .LBB1_612
+# %bb.611:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_612:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_613
+.LBB1_617:
+ xor edi, edi
+.LBB1_618:
+ test r9b, 1
+ je .LBB1_620
+# %bb.619:
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ subps xmm2, xmm1
+ subps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+.LBB1_620:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_621
+.LBB1_625:
+ xor edi, edi
+.LBB1_626:
+ test r9b, 1
+ je .LBB1_628
+# %bb.627:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ psubq xmm1, xmm0
+ psubq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_628:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_629
+.LBB1_633:
+ xor edi, edi
+.LBB1_634:
+ test r9b, 1
+ je .LBB1_636
+# %bb.635:
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ subps xmm2, xmm1
+ subps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+.LBB1_636:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_637
+.LBB1_641:
+ xor edi, edi
+.LBB1_642:
+ test r9b, 1
+ je .LBB1_644
+# %bb.643:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_644:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_645
+.LBB1_649:
+ xor edi, edi
+.LBB1_650:
+ test r9b, 1
+ je .LBB1_652
+# %bb.651:
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+.LBB1_652:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_653
+.LBB1_657:
+ xor edi, edi
+.LBB1_658:
+ test r9b, 1
+ je .LBB1_660
+# %bb.659:
+ movdqu xmm1, xmmword ptr [rdx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB1_660:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_661
+.LBB1_665:
+ xor edi, edi
+.LBB1_666:
+ test r9b, 1
+ je .LBB1_668
+# %bb.667:
+ movups xmm2, xmmword ptr [rdx + 4*rdi]
+ movups xmm3, xmmword ptr [rdx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+.LBB1_668:
+ cmp rcx, rax
+ je .LBB1_737
+ jmp .LBB1_669
+.LBB1_673:
+ xor edi, edi
+.LBB1_674:
+ test r9b, 1
+ je .LBB1_676
+# %bb.675:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_676:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_677
+.LBB1_681:
+ xor edi, edi
+.LBB1_682:
+ test r9b, 1
+ je .LBB1_684
+# %bb.683:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ psubb xmm1, xmm0
+ psubb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_684:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_685
+.LBB1_689:
+ xor edi, edi
+.LBB1_690:
+ test r9b, 1
+ je .LBB1_692
+# %bb.691:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_692:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_693
+.LBB1_697:
+ xor edi, edi
+.LBB1_698:
+ test r9b, 1
+ je .LBB1_700
+# %bb.699:
+ movdqu xmm1, xmmword ptr [rdx + rdi]
+ movdqu xmm2, xmmword ptr [rdx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB1_700:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_701
+.LBB1_705:
+ xor edi, edi
+.LBB1_706:
+ test r9b, 1
+ je .LBB1_708
+# %bb.707:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_708:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_709
+.LBB1_713:
+ xor edi, edi
+.LBB1_714:
+ test r9b, 1
+ je .LBB1_716
+# %bb.715:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ psubd xmm1, xmm0
+ psubd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_716:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_717
+.LBB1_721:
+ xor edi, edi
+.LBB1_722:
+ test r9b, 1
+ je .LBB1_724
+# %bb.723:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_724:
+ cmp rsi, r10
+ je .LBB1_737
+ jmp .LBB1_725
+.LBB1_729:
+ xor edi, edi
+.LBB1_730:
+ test r9b, 1
+ je .LBB1_732
+# %bb.731:
+ movdqu xmm1, xmmword ptr [rdx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB1_732:
+ cmp rsi, r10
+ jne .LBB1_733
+.LBB1_737: # shared exit of arithmetic_arr_scalar_sse4, targeted by the `je .LBB1_737` branches above
+ mov rsp, rbp # restore the stack pointer from the frame pointer
+ pop rbp # restore the caller's frame pointer
+ ret
+.Lfunc_end1:
+ .size arithmetic_arr_scalar_sse4, .Lfunc_end1-arithmetic_arr_scalar_sse4 # symbol size = distance from the function label to .Lfunc_end1
+ # -- End function
+ .globl arithmetic_scalar_arr_sse4 # -- Begin function arithmetic_scalar_arr_sse4
+ .p2align 4, 0x90 # align the entry to 16 bytes, padding with 0x90 (nop)
+ .type arithmetic_scalar_arr_sse4,@function # registers as used below: edi and sil select the kernel, rdx -> scalar, rcx -> input array, r8 -> output array, r9d = element count
+arithmetic_scalar_arr_sse4: # @arithmetic_scalar_arr_sse4
+# %bb.0:
+ push rbp
+ mov rbp, rsp
+ and rsp, -8 # round rsp down to a multiple of 8
+ cmp sil, 1
+ jg .LBB2_11 # selector > 1: values 2 and 3 are handled at .LBB2_11
+# %bb.1:
+ test sil, sil
+ je .LBB2_21 # selector == 0: addition kernels
+# %bb.2:
+ cmp sil, 1
+ jne .LBB2_737 # any other selector: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.3:
+ cmp edi, 6 # selector == 1 (scalar - element): dispatch on edi; NOTE(review): values look like Arrow type ids -- confirm
+ jg .LBB2_37 # edi > 6
+# %bb.4:
+ cmp edi, 3
+ jle .LBB2_65 # edi <= 3
+# %bb.5:
+ cmp edi, 4
+ je .LBB2_105
+# %bb.6:
+ cmp edi, 5
+ je .LBB2_108
+# %bb.7:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.8:
+ test r9d, r9d # edi == 6: 4-byte integer elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.9:
+ mov r11d, dword ptr [rdx] # load the 32-bit scalar once
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_10 # fewer than 8 elements: go straight to the scalar loop
+# %bb.177:
+ lea rdx, [rcx + 4*r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_297 # input ends at or before the output start: take .LBB2_297 (outside this view; presumably the vectorized loop)
+# %bb.178:
+ lea rdx, [r8 + 4*r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_297 # output ends at or before the input start: same path
+.LBB2_10:
+ xor esi, esi # buffers overlap or the array is short: start the scalar loop at index 0
+.LBB2_421:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_423
+.LBB2_422: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi] # eax = scalar - in[rsi]
+ mov dword ptr [r8 + 4*rsi], eax # out[rsi] = eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_422
+.LBB2_423:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_421: nothing left for the unrolled loop
+.LBB2_424: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # four elements per iteration, each out[i] = scalar - in[i]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_424 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_11: # reached from the function entry when sil > 1
+ cmp sil, 2
+ je .LBB2_29 # selector == 2: addition kernels at .LBB2_29
+# %bb.12:
+ cmp sil, 3
+ jne .LBB2_737 # selector > 3: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.13:
+ cmp edi, 6 # selector == 3 (scalar - element): dispatch on edi
+ jg .LBB2_44 # edi > 6
+# %bb.14:
+ cmp edi, 3
+ jle .LBB2_70 # edi <= 3
+# %bb.15:
+ cmp edi, 4
+ je .LBB2_111
+# %bb.16:
+ cmp edi, 5
+ je .LBB2_114
+# %bb.17:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.18:
+ test r9d, r9d # edi == 6: 4-byte integer elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.19:
+ mov r11d, dword ptr [rdx] # load the 32-bit scalar once
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_20 # fewer than 8 elements: go straight to the scalar loop
+# %bb.180:
+ lea rdx, [rcx + 4*r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_300 # input ends at or before the output start: take .LBB2_300 (outside this view; presumably the vectorized loop)
+# %bb.181:
+ lea rdx, [r8 + 4*r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_300 # output ends at or before the input start: same path
+.LBB2_20:
+ xor esi, esi # scalar loop starts at index 0
+.LBB2_429:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_431
+.LBB2_430: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi] # eax = scalar - in[rsi]
+ mov dword ptr [r8 + 4*rsi], eax # out[rsi] = eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_430
+.LBB2_431:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_429: done
+.LBB2_432: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # four elements per iteration, each out[i] = scalar - in[i]
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_432 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_21: # reached from the function entry when sil == 0 (element + scalar): dispatch on edi
+ cmp edi, 6
+ jg .LBB2_51 # edi > 6
+# %bb.22:
+ cmp edi, 3
+ jle .LBB2_75 # edi <= 3
+# %bb.23:
+ cmp edi, 4
+ je .LBB2_117
+# %bb.24:
+ cmp edi, 5
+ je .LBB2_120
+# %bb.25:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.26:
+ test r9d, r9d # edi == 6: 4-byte integer elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.27:
+ mov eax, dword ptr [rdx] # load the 32-bit scalar once; it stays in eax for the whole loop
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_28 # fewer than 8 elements: go straight to the scalar loop
+# %bb.183:
+ lea rdx, [rcx + 4*r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_303 # input ends at or before the output start: take .LBB2_303 (outside this view; presumably the vectorized loop)
+# %bb.184:
+ lea rdx, [r8 + 4*r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_303 # output ends at or before the input start: same path
+.LBB2_28:
+ xor esi, esi # scalar loop starts at index 0
+.LBB2_437:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_439
+.LBB2_438: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax # edx = in[rsi] + scalar
+ mov dword ptr [r8 + 4*rsi], edx # out[rsi] = edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_438
+.LBB2_439:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_437: done
+.LBB2_440: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi] # four elements per iteration, each out[i] = in[i] + scalar
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_440 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_29:
+ cmp edi, 6
+ jg .LBB2_58
+# %bb.30:
+ cmp edi, 3
+ jle .LBB2_80
+# %bb.31:
+ cmp edi, 4
+ je .LBB2_123
+# %bb.32:
+ cmp edi, 5
+ je .LBB2_126
+# %bb.33:
+ cmp edi, 6
+ jne .LBB2_737
+# %bb.34:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.35:
+ mov eax, dword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 8
+ jb .LBB2_36
+# %bb.186:
+ lea rdx, [rcx + 4*r10]
+ cmp rdx, r8
+ jbe .LBB2_306
+# %bb.187:
+ lea rdx, [r8 + 4*r10]
+ cmp rdx, rcx
+ jbe .LBB2_306
+.LBB2_36:
+ xor esi, esi
+.LBB2_445:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_447
+.LBB2_446: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_446
+.LBB2_447:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_448: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_448
+ jmp .LBB2_737
+.LBB2_37: # reached when sil == 1 and edi > 6 (scalar - element, wider types)
+ cmp edi, 8
+ jle .LBB2_85 # edi is 7 or 8
+# %bb.38:
+ cmp edi, 9
+ je .LBB2_129
+# %bb.39:
+ cmp edi, 11
+ je .LBB2_132
+# %bb.40:
+ cmp edi, 12
+ jne .LBB2_737 # unhandled edi: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.41:
+ test r9d, r9d # edi == 12: 8-byte floating-point elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.42:
+ movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d # rax = count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_43 # fewer than 4 elements: go straight to the scalar loop
+# %bb.189:
+ lea rdx, [rcx + 8*rax] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_309 # input ends at or before the output start: take .LBB2_309 (outside this view; presumably the vectorized loop)
+# %bb.190:
+ lea rdx, [r8 + 8*rax] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_309 # output ends at or before the input start: same path
+.LBB2_43:
+ xor edx, edx # scalar loop starts at index 0 (rdx is the index here)
+.LBB2_453:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - rdx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_455
+.LBB2_454: # =>This Inner Loop Header: Depth=1
+ movapd xmm1, xmm0 # copy the scalar so xmm0 stays intact
+ subsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = scalar - in[rdx]
+ movsd qword ptr [r8 + 8*rdx], xmm1 # out[rdx] = xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_454
+.LBB2_455:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_453: done
+.LBB2_456: # =>This Inner Loop Header: Depth=1
+ movapd xmm1, xmm0 # four elements per iteration, each out[i] = scalar - in[i]
+ subsd xmm1, qword ptr [rcx + 8*rdx]
+ movsd qword ptr [r8 + 8*rdx], xmm1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx + 8]
+ movsd qword ptr [r8 + 8*rdx + 8], xmm1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx + 16]
+ movsd qword ptr [r8 + 8*rdx + 16], xmm1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx + 24]
+ movsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_456 # loop until rdx reaches the count
+ jmp .LBB2_737
+.LBB2_44:
+ cmp edi, 8
+ jle .LBB2_90
+# %bb.45:
+ cmp edi, 9
+ je .LBB2_135
+# %bb.46:
+ cmp edi, 11
+ je .LBB2_138
+# %bb.47:
+ cmp edi, 12
+ jne .LBB2_737
+# %bb.48:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.49:
+ movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 4
+ jb .LBB2_50
+# %bb.192:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB2_312
+# %bb.193:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB2_312
+.LBB2_50:
+ xor edx, edx
+.LBB2_461:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_463
+.LBB2_462: # =>This Inner Loop Header: Depth=1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx]
+ movsd qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_462
+.LBB2_463:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_464: # =>This Inner Loop Header: Depth=1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx]
+ movsd qword ptr [r8 + 8*rdx], xmm1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx + 8]
+ movsd qword ptr [r8 + 8*rdx + 8], xmm1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx + 16]
+ movsd qword ptr [r8 + 8*rdx + 16], xmm1
+ movapd xmm1, xmm0
+ subsd xmm1, qword ptr [rcx + 8*rdx + 24]
+ movsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_464
+ jmp .LBB2_737
+.LBB2_51: # reached when sil == 0 and edi > 6 (element + scalar, wider types)
+ cmp edi, 8
+ jle .LBB2_95 # edi is 7 or 8
+# %bb.52:
+ cmp edi, 9
+ je .LBB2_141
+# %bb.53:
+ cmp edi, 11
+ je .LBB2_144
+# %bb.54:
+ cmp edi, 12
+ jne .LBB2_737 # unhandled edi: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.55:
+ test r9d, r9d # edi == 12: 8-byte floating-point elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.56:
+ movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d # rax = count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_57 # fewer than 4 elements: go straight to the scalar loop
+# %bb.195:
+ lea rdx, [rcx + 8*rax] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_315 # input ends at or before the output start: take .LBB2_315 (outside this view; presumably the vectorized loop)
+# %bb.196:
+ lea rdx, [r8 + 8*rax] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_315 # output ends at or before the input start: same path
+.LBB2_57:
+ xor edx, edx # scalar loop starts at index 0 (rdx is the index here)
+.LBB2_469:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - rdx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_471
+.LBB2_470: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0 # xmm1 = in[rdx] + scalar
+ movsd qword ptr [r8 + 8*rdx], xmm1 # out[rdx] = xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_470
+.LBB2_471:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_469: done
+.LBB2_472: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0 # four elements per iteration, each out[i] = in[i] + scalar
+ movsd qword ptr [r8 + 8*rdx], xmm1
+ movsd xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx + 8], xmm1
+ movsd xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx + 16], xmm1
+ movsd xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_472 # loop until rdx reaches the count
+ jmp .LBB2_737
+.LBB2_58:
+ cmp edi, 8
+ jle .LBB2_100
+# %bb.59:
+ cmp edi, 9
+ je .LBB2_147
+# %bb.60:
+ cmp edi, 11
+ je .LBB2_150
+# %bb.61:
+ cmp edi, 12
+ jne .LBB2_737
+# %bb.62:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.63:
+ movsd xmm0, qword ptr [rdx] # xmm0 = mem[0],zero
+ mov eax, r9d
+ cmp r9d, 4
+ jb .LBB2_64
+# %bb.198:
+ lea rdx, [rcx + 8*rax]
+ cmp rdx, r8
+ jbe .LBB2_318
+# %bb.199:
+ lea rdx, [r8 + 8*rax]
+ cmp rdx, rcx
+ jbe .LBB2_318
+.LBB2_64:
+ xor edx, edx
+.LBB2_477:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax
+ mov rdi, rax
+ and rdi, 3
+ je .LBB2_479
+.LBB2_478: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_478
+.LBB2_479:
+ cmp rsi, 3
+ jb .LBB2_737
+.LBB2_480: # =>This Inner Loop Header: Depth=1
+ movsd xmm1, qword ptr [rcx + 8*rdx] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx], xmm1
+ movsd xmm1, qword ptr [rcx + 8*rdx + 8] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx + 8], xmm1
+ movsd xmm1, qword ptr [rcx + 8*rdx + 16] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx + 16], xmm1
+ movsd xmm1, qword ptr [rcx + 8*rdx + 24] # xmm1 = mem[0],zero
+ addsd xmm1, xmm0
+ movsd qword ptr [r8 + 8*rdx + 24], xmm1
+ add rdx, 4
+ cmp rax, rdx
+ jne .LBB2_480
+ jmp .LBB2_737
+.LBB2_65: # reached when sil == 1 and edi <= 3 (scalar - element, narrow types)
+ cmp edi, 2
+ je .LBB2_153
+# %bb.66:
+ cmp edi, 3
+ jne .LBB2_737 # edi < 2: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.67:
+ test r9d, r9d # edi == 3: 1-byte elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.68:
+ mov r11b, byte ptr [rdx] # load the 8-bit scalar once into the low byte of r11
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_69 # fewer than 32 elements: go straight to the scalar loop
+# %bb.201:
+ lea rdx, [rcx + r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_321 # input ends at or before the output start: take .LBB2_321 (outside this view; presumably the vectorized loop)
+# %bb.202:
+ lea rdx, [r8 + r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_321 # output ends at or before the input start: same path
+.LBB2_69:
+ xor esi, esi # scalar loop starts at index 0
+.LBB2_485:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_487
+.LBB2_486: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # only the low byte (al) is used below
+ sub al, byte ptr [rcx + rsi] # al = scalar - in[rsi], wrapping at 8 bits
+ mov byte ptr [r8 + rsi], al # out[rsi] = al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_486
+.LBB2_487:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_485: done
+.LBB2_488: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # four elements per iteration, each out[i] = scalar - in[i]
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_488 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_70:
+ cmp edi, 2
+ je .LBB2_156
+# %bb.71:
+ cmp edi, 3
+ jne .LBB2_737
+# %bb.72:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.73:
+ mov r11b, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_74
+# %bb.204:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_324
+# %bb.205:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_324
+.LBB2_74:
+ xor esi, esi
+.LBB2_493:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_495
+.LBB2_494: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_494
+.LBB2_495:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_496: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_496
+ jmp .LBB2_737
+.LBB2_75:
+ cmp edi, 2
+ je .LBB2_159
+# %bb.76:
+ cmp edi, 3
+ jne .LBB2_737
+# %bb.77:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.78:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_79
+# %bb.207:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_327
+# %bb.208:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_327
+.LBB2_79:
+ xor esi, esi
+.LBB2_501:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_503
+.LBB2_502: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_502
+.LBB2_503:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_504: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_504
+ jmp .LBB2_737
+.LBB2_80:
+ cmp edi, 2
+ je .LBB2_162
+# %bb.81:
+ cmp edi, 3
+ jne .LBB2_737
+# %bb.82:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.83:
+ mov al, byte ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 32
+ jb .LBB2_84
+# %bb.210:
+ lea rdx, [rcx + r10]
+ cmp rdx, r8
+ jbe .LBB2_330
+# %bb.211:
+ lea rdx, [r8 + r10]
+ cmp rdx, rcx
+ jbe .LBB2_330
+.LBB2_84:
+ xor esi, esi
+.LBB2_509:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_511
+.LBB2_510: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_510
+.LBB2_511:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_512: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_512
+ jmp .LBB2_737
+.LBB2_85: # reached when sil == 1 and edi is 7 or 8 (scalar - element)
+ cmp edi, 7
+ je .LBB2_165
+# %bb.86:
+ cmp edi, 8
+ jne .LBB2_737 # leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.87:
+ test r9d, r9d # edi == 8: 8-byte integer elements
+ jle .LBB2_737 # count <= 0: nothing to do
+# %bb.88:
+ mov r11, qword ptr [rdx] # load the 64-bit scalar once
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_89 # fewer than 4 elements: go straight to the scalar loop
+# %bb.213:
+ lea rdx, [rcx + 8*r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_333 # input ends at or before the output start: take .LBB2_333 (outside this view; presumably the vectorized loop)
+# %bb.214:
+ lea rdx, [r8 + 8*r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_333 # output ends at or before the input start: same path
+.LBB2_89:
+ xor esi, esi # scalar loop starts at index 0
+.LBB2_517:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_519
+.LBB2_518: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi] # rax = scalar - in[rsi]
+ mov qword ptr [r8 + 8*rsi], rax # out[rsi] = rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_518
+.LBB2_519:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_517: done
+.LBB2_520: # =>This Inner Loop Header: Depth=1
+ mov rax, r11 # four elements per iteration, each out[i] = scalar - in[i]
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_520 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_90:
+ cmp edi, 7
+ je .LBB2_168
+# %bb.91:
+ cmp edi, 8
+ jne .LBB2_737
+# %bb.92:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.93:
+ mov r11, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB2_94
+# %bb.216:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_336
+# %bb.217:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_336
+.LBB2_94:
+ xor esi, esi
+.LBB2_525:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_527
+.LBB2_526: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_526
+.LBB2_527:
+ cmp rdx, 3
+ jb .LBB2_737
+.LBB2_528: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_528
+ jmp .LBB2_737
+.LBB2_95:
+ cmp edi, 7
+ je .LBB2_171
+# %bb.96:
+ cmp edi, 8
+ jne .LBB2_737
+# %bb.97:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.98:
+ mov rax, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB2_99
+# %bb.219:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_339
+# %bb.220:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_339
+.LBB2_99:
+ xor esi, esi
+.LBB2_533:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_535
+.LBB2_534: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_534
+.LBB2_535:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_536: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_536
+ jmp .LBB2_737
+.LBB2_100:
+ cmp edi, 7
+ je .LBB2_174
+# %bb.101:
+ cmp edi, 8
+ jne .LBB2_737
+# %bb.102:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.103:
+ mov rax, qword ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 4
+ jb .LBB2_104
+# %bb.222:
+ lea rdx, [rcx + 8*r10]
+ cmp rdx, r8
+ jbe .LBB2_342
+# %bb.223:
+ lea rdx, [r8 + 8*r10]
+ cmp rdx, rcx
+ jbe .LBB2_342
+.LBB2_104:
+ xor esi, esi
+.LBB2_541:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_543
+.LBB2_542: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_542
+.LBB2_543:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_544: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_544
+ jmp .LBB2_737
+.LBB2_105: # reached when sil == 1 and edi == 4 (scalar - element, 2-byte elements)
+ test r9d, r9d
+ jle .LBB2_737 # count <= 0: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.106:
+ movzx eax, word ptr [rdx] # load the 16-bit scalar once, zero-extended into eax
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB2_107 # fewer than 16 elements: go straight to the scalar loop
+# %bb.225:
+ lea rdx, [rcx + 2*r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_345 # input ends at or before the output start: take .LBB2_345 (outside this view; presumably the vectorized loop)
+# %bb.226:
+ lea rdx, [r8 + 2*r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_345 # output ends at or before the input start: same path
+.LBB2_107:
+ xor esi, esi # scalar loop starts at index 0
+.LBB2_549:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_551
+.LBB2_550: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi] # dx = scalar - in[rsi], wrapping at 16 bits
+ mov word ptr [r8 + 2*rsi], dx # out[rsi] = dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_550
+.LBB2_551:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_549: done
+.LBB2_552: # =>This Inner Loop Header: Depth=1
+ mov edx, eax # four elements per iteration, each out[i] = scalar - in[i]
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_552 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_108:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.109:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_110
+# %bb.228:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_348
+# %bb.229:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_348
+.LBB2_110:
+ xor esi, esi
+.LBB2_557:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_559
+.LBB2_558: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_558
+.LBB2_559:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_560: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_560
+ jmp .LBB2_737
+.LBB2_111:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.112:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_113
+# %bb.231:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_351
+# %bb.232:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_351
+.LBB2_113:
+ xor esi, esi
+.LBB2_565:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_567
+.LBB2_566: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_566
+.LBB2_567:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_568: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_568
+ jmp .LBB2_737
+.LBB2_114:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.115:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_116
+# %bb.234:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_354
+# %bb.235:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_354
+.LBB2_116:
+ xor esi, esi
+.LBB2_573:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_575
+.LBB2_574: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_574
+.LBB2_575:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_576: # =>This Inner Loop Header: Depth=1
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi]
+ mov word ptr [r8 + 2*rsi], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 2]
+ mov word ptr [r8 + 2*rsi + 2], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 4]
+ mov word ptr [r8 + 2*rsi + 4], dx
+ mov edx, eax
+ sub dx, word ptr [rcx + 2*rsi + 6]
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_576
+ jmp .LBB2_737
+.LBB2_117: # reached when sil == 0 and edi == 4 (element + scalar, 2-byte elements)
+ test r9d, r9d
+ jle .LBB2_737 # count <= 0: leave via .LBB2_737 (outside this view; presumably the common exit)
+# %bb.118:
+ movzx eax, word ptr [rdx] # load the 16-bit scalar once, zero-extended into eax
+ mov r10d, r9d # r10 = count, zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB2_119 # fewer than 16 elements: go straight to the scalar loop
+# %bb.237:
+ lea rdx, [rcx + 2*r10] # rdx = one past the end of the input
+ cmp rdx, r8
+ jbe .LBB2_357 # input ends at or before the output start: take .LBB2_357 (outside this view; presumably the vectorized loop)
+# %bb.238:
+ lea rdx, [r8 + 2*r10] # rdx = one past the end of the output
+ cmp rdx, rcx
+ jbe .LBB2_357 # output ends at or before the input start: same path
+.LBB2_119:
+ xor esi, esi # scalar loop starts at index 0
+.LBB2_581:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = number of single-element steps before the unrolled loop
+ je .LBB2_583
+.LBB2_582: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax # dx = in[rsi] + scalar, wrapping at 16 bits
+ mov word ptr [r8 + 2*rsi], dx # out[rsi] = dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_582
+.LBB2_583:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were pending at .LBB2_581: done
+.LBB2_584: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi] # four elements per iteration, each out[i] = in[i] + scalar
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_584 # loop until rsi reaches the count
+ jmp .LBB2_737
+.LBB2_120:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.121:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_122
+# %bb.240:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_360
+# %bb.241:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_360
+.LBB2_122:
+ xor esi, esi
+.LBB2_589:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_591
+.LBB2_590: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_590
+.LBB2_591:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_592: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_592
+ jmp .LBB2_737
+.LBB2_123:
+ test r9d, r9d
+ jle .LBB2_737
+# %bb.124:
+ movzx eax, word ptr [rdx]
+ mov r10d, r9d
+ cmp r9d, 16
+ jb .LBB2_125
+# %bb.243:
+ lea rdx, [rcx + 2*r10]
+ cmp rdx, r8
+ jbe .LBB2_363
+# %bb.244:
+ lea rdx, [r8 + 2*r10]
+ cmp rdx, rcx
+ jbe .LBB2_363
+.LBB2_125:
+ xor esi, esi
+.LBB2_597:
+ mov r9, rsi
+ not r9
+ add r9, r10
+ mov rdi, r10
+ and rdi, 3
+ je .LBB2_599
+.LBB2_598: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_598
+.LBB2_599:
+ cmp r9, 3
+ jb .LBB2_737
+.LBB2_600: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4
+ cmp r10, rsi
+ jne .LBB2_600
+ jmp .LBB2_737
+.LBB2_126: # 16-bit case: [r8 + 2*i] = [rcx + 2*i] + word loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.127:
+ movzx eax, word ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 16
+ jb .LBB2_128 # fewer than 16 elements: use the scalar loops
+# %bb.246:
+ lea rdx, [rcx + 2*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_366 # rcx buffer ends at or before r8: no overlap (.LBB2_366 not in view, presumably the vector path)
+# %bb.247:
+ lea rdx, [r8 + 2*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_366 # r8 buffer ends at or before rcx: no overlap
+.LBB2_128: # count < 16 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_605:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_607
+.LBB2_606: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_606
+.LBB2_607:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_605: skip the unrolled loop
+.LBB2_608: # =>This Inner Loop Header: Depth=1
+ movzx edx, word ptr [rcx + 2*rsi]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi], dx
+ movzx edx, word ptr [rcx + 2*rsi + 2]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 2], dx
+ movzx edx, word ptr [rcx + 2*rsi + 4]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 4], dx
+ movzx edx, word ptr [rcx + 2*rsi + 6]
+ add dx, ax
+ mov word ptr [r8 + 2*rsi + 6], dx
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_608
+ jmp .LBB2_737
+.LBB2_129: # 64-bit case: [r8 + 8*i] = qword loaded from [rdx] - [rcx + 8*i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.130:
+ mov r11, qword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_131 # fewer than 4 elements: use the scalar loops
+# %bb.249:
+ lea rdx, [rcx + 8*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_369 # rcx buffer ends at or before r8: no overlap (.LBB2_369 not in view, presumably the vector path)
+# %bb.250:
+ lea rdx, [r8 + 8*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_369 # r8 buffer ends at or before rcx: no overlap
+.LBB2_131: # count < 4 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_613:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_615
+.LBB2_614: # =>This Inner Loop Header: Depth=1
+ mov rax, r11 # fresh copy of the scalar; sub below overwrites rax
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_614
+.LBB2_615:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_613: skip the unrolled loop
+.LBB2_616: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_616
+ jmp .LBB2_737
+.LBB2_132: # single-precision case: [r8 + 4*i] = float loaded from [rdx] - [rcx + 4*i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.133:
+ movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_134 # fewer than 8 elements: use the scalar loops
+# %bb.252:
+ lea rdx, [rcx + 4*rax] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_372 # rcx buffer ends at or before r8: no overlap (.LBB2_372 not in view, presumably the vector path)
+# %bb.253:
+ lea rdx, [r8 + 4*rax] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_372 # r8 buffer ends at or before rcx: no overlap
+.LBB2_134: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor edx, edx
+.LBB2_621:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - rdx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_623
+.LBB2_622: # =>This Inner Loop Header: Depth=1
+ movaps xmm1, xmm0 # fresh copy of the scalar; subss below overwrites xmm1
+ subss xmm1, dword ptr [rcx + 4*rdx]
+ movss dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_622
+.LBB2_623:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_621: skip the unrolled loop
+.LBB2_624: # =>This Inner Loop Header: Depth=1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx]
+ movss dword ptr [r8 + 4*rdx], xmm1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx + 4]
+ movss dword ptr [r8 + 4*rdx + 4], xmm1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx + 8]
+ movss dword ptr [r8 + 4*rdx + 8], xmm1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx + 12]
+ movss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4 # four elements per trip
+ cmp rax, rdx
+ jne .LBB2_624
+ jmp .LBB2_737
+.LBB2_135: # 64-bit case: [r8 + 8*i] = qword loaded from [rdx] - [rcx + 8*i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.136:
+ mov r11, qword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_137 # fewer than 4 elements: use the scalar loops
+# %bb.255:
+ lea rdx, [rcx + 8*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_375 # rcx buffer ends at or before r8: no overlap (.LBB2_375 not in view, presumably the vector path)
+# %bb.256:
+ lea rdx, [r8 + 8*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_375 # r8 buffer ends at or before rcx: no overlap
+.LBB2_137: # count < 4 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_629:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_631
+.LBB2_630: # =>This Inner Loop Header: Depth=1
+ mov rax, r11 # fresh copy of the scalar; sub below overwrites rax
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_630
+.LBB2_631:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_629: skip the unrolled loop
+.LBB2_632: # =>This Inner Loop Header: Depth=1
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi]
+ mov qword ptr [r8 + 8*rsi], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 8]
+ mov qword ptr [r8 + 8*rsi + 8], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 16]
+ mov qword ptr [r8 + 8*rsi + 16], rax
+ mov rax, r11
+ sub rax, qword ptr [rcx + 8*rsi + 24]
+ mov qword ptr [r8 + 8*rsi + 24], rax
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_632
+ jmp .LBB2_737
+.LBB2_138: # single-precision case: [r8 + 4*i] = float loaded from [rdx] - [rcx + 4*i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.139:
+ movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_140 # fewer than 8 elements: use the scalar loops
+# %bb.258:
+ lea rdx, [rcx + 4*rax] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_378 # rcx buffer ends at or before r8: no overlap (.LBB2_378 not in view, presumably the vector path)
+# %bb.259:
+ lea rdx, [r8 + 4*rax] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_378 # r8 buffer ends at or before rcx: no overlap
+.LBB2_140: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor edx, edx
+.LBB2_637:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - rdx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_639
+.LBB2_638: # =>This Inner Loop Header: Depth=1
+ movaps xmm1, xmm0 # fresh copy of the scalar; subss below overwrites xmm1
+ subss xmm1, dword ptr [rcx + 4*rdx]
+ movss dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_638
+.LBB2_639:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_637: skip the unrolled loop
+.LBB2_640: # =>This Inner Loop Header: Depth=1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx]
+ movss dword ptr [r8 + 4*rdx], xmm1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx + 4]
+ movss dword ptr [r8 + 4*rdx + 4], xmm1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx + 8]
+ movss dword ptr [r8 + 4*rdx + 8], xmm1
+ movaps xmm1, xmm0
+ subss xmm1, dword ptr [rcx + 4*rdx + 12]
+ movss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4 # four elements per trip
+ cmp rax, rdx
+ jne .LBB2_640
+ jmp .LBB2_737
+.LBB2_141: # 64-bit case: [r8 + 8*i] = [rcx + 8*i] + qword loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.142:
+ mov rax, qword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_143 # fewer than 4 elements: use the scalar loops
+# %bb.261:
+ lea rdx, [rcx + 8*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_381 # rcx buffer ends at or before r8: no overlap (.LBB2_381 not in view, presumably the vector path)
+# %bb.262:
+ lea rdx, [r8 + 8*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_381 # r8 buffer ends at or before rcx: no overlap
+.LBB2_143: # count < 4 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_645:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_647
+.LBB2_646: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_646
+.LBB2_647:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_645: skip the unrolled loop
+.LBB2_648: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_648
+ jmp .LBB2_737
+.LBB2_144: # single-precision case: [r8 + 4*i] = [rcx + 4*i] + float loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.145:
+ movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_146 # fewer than 8 elements: use the scalar loops
+# %bb.264:
+ lea rdx, [rcx + 4*rax] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_384 # rcx buffer ends at or before r8: no overlap (.LBB2_384 not in view, presumably the vector path)
+# %bb.265:
+ lea rdx, [r8 + 4*rax] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_384 # r8 buffer ends at or before rcx: no overlap
+.LBB2_146: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor edx, edx
+.LBB2_653:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - rdx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_655
+.LBB2_654: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_654
+.LBB2_655:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_653: skip the unrolled loop
+.LBB2_656: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx], xmm1
+ movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx + 4], xmm1
+ movss xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx + 8], xmm1
+ movss xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4 # four elements per trip
+ cmp rax, rdx
+ jne .LBB2_656
+ jmp .LBB2_737
+.LBB2_147: # 64-bit case: [r8 + 8*i] = [rcx + 8*i] + qword loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.148:
+ mov rax, qword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 4
+ jb .LBB2_149 # fewer than 4 elements: use the scalar loops
+# %bb.267:
+ lea rdx, [rcx + 8*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_387 # rcx buffer ends at or before r8: no overlap (.LBB2_387 not in view, presumably the vector path)
+# %bb.268:
+ lea rdx, [r8 + 8*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_387 # r8 buffer ends at or before rcx: no overlap
+.LBB2_149: # count < 4 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_661:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_663
+.LBB2_662: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_662
+.LBB2_663:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_661: skip the unrolled loop
+.LBB2_664: # =>This Inner Loop Header: Depth=1
+ mov rdx, qword ptr [rcx + 8*rsi]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 8]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 8], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 16]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 16], rdx
+ mov rdx, qword ptr [rcx + 8*rsi + 24]
+ add rdx, rax
+ mov qword ptr [r8 + 8*rsi + 24], rdx
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_664
+ jmp .LBB2_737
+.LBB2_150: # single-precision case: [r8 + 4*i] = [rcx + 4*i] + float loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.151:
+ movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
+ mov eax, r9d # rax = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_152 # fewer than 8 elements: use the scalar loops
+# %bb.270:
+ lea rdx, [rcx + 4*rax] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_390 # rcx buffer ends at or before r8: no overlap (.LBB2_390 not in view, presumably the vector path)
+# %bb.271:
+ lea rdx, [r8 + 4*rax] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_390 # r8 buffer ends at or before rcx: no overlap
+.LBB2_152: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor edx, edx
+.LBB2_669:
+ mov rsi, rdx
+ not rsi
+ add rsi, rax # rsi = count - rdx - 1
+ mov rdi, rax
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_671
+.LBB2_670: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx], xmm1
+ add rdx, 1
+ add rdi, -1
+ jne .LBB2_670
+.LBB2_671:
+ cmp rsi, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_669: skip the unrolled loop
+.LBB2_672: # =>This Inner Loop Header: Depth=1
+ movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx], xmm1
+ movss xmm1, dword ptr [rcx + 4*rdx + 4] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx + 4], xmm1
+ movss xmm1, dword ptr [rcx + 4*rdx + 8] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx + 8], xmm1
+ movss xmm1, dword ptr [rcx + 4*rdx + 12] # xmm1 = mem[0],zero,zero,zero
+ addss xmm1, xmm0
+ movss dword ptr [r8 + 4*rdx + 12], xmm1
+ add rdx, 4 # four elements per trip
+ cmp rax, rdx
+ jne .LBB2_672
+ jmp .LBB2_737
+.LBB2_153: # byte case: [r8 + i] = byte loaded from [rdx] - [rcx + i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.154:
+ mov r11b, byte ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_155 # fewer than 32 elements: use the scalar loops
+# %bb.273:
+ lea rdx, [rcx + r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_393 # rcx buffer ends at or before r8: no overlap (.LBB2_393 not in view, presumably the vector path)
+# %bb.274:
+ lea rdx, [r8 + r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_393 # r8 buffer ends at or before rcx: no overlap
+.LBB2_155: # count < 32 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_677:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_679
+.LBB2_678: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # fresh copy of the scalar; only al is used by the sub below
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_678
+.LBB2_679:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_677: skip the unrolled loop
+.LBB2_680: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_680
+ jmp .LBB2_737
+.LBB2_156: # byte case: [r8 + i] = byte loaded from [rdx] - [rcx + i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.157:
+ mov r11b, byte ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_158 # fewer than 32 elements: use the scalar loops
+# %bb.276:
+ lea rdx, [rcx + r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_396 # rcx buffer ends at or before r8: no overlap (.LBB2_396 not in view, presumably the vector path)
+# %bb.277:
+ lea rdx, [r8 + r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_396 # r8 buffer ends at or before rcx: no overlap
+.LBB2_158: # count < 32 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_685:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_687
+.LBB2_686: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # fresh copy of the scalar; only al is used by the sub below
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_686
+.LBB2_687:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_685: skip the unrolled loop
+.LBB2_688: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi]
+ mov byte ptr [r8 + rsi], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 1]
+ mov byte ptr [r8 + rsi + 1], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 2]
+ mov byte ptr [r8 + rsi + 2], al
+ mov eax, r11d
+ sub al, byte ptr [rcx + rsi + 3]
+ mov byte ptr [r8 + rsi + 3], al
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_688
+ jmp .LBB2_737
+.LBB2_159: # byte case: [r8 + i] = [rcx + i] + byte loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.160:
+ mov al, byte ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_161 # fewer than 32 elements: use the scalar loops
+# %bb.279:
+ lea rdx, [rcx + r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_399 # rcx buffer ends at or before r8: no overlap (.LBB2_399 not in view, presumably the vector path)
+# %bb.280:
+ lea rdx, [r8 + r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_399 # r8 buffer ends at or before rcx: no overlap
+.LBB2_161: # count < 32 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_693:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_695
+.LBB2_694: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_694
+.LBB2_695:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_693: skip the unrolled loop
+.LBB2_696: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_696
+ jmp .LBB2_737
+.LBB2_162: # byte case: [r8 + i] = [rcx + i] + byte loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.163:
+ mov al, byte ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 32
+ jb .LBB2_164 # fewer than 32 elements: use the scalar loops
+# %bb.282:
+ lea rdx, [rcx + r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_402 # rcx buffer ends at or before r8: no overlap (.LBB2_402 not in view, presumably the vector path)
+# %bb.283:
+ lea rdx, [r8 + r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_402 # r8 buffer ends at or before rcx: no overlap
+.LBB2_164: # count < 32 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_701:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_703
+.LBB2_702: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_702
+.LBB2_703:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_701: skip the unrolled loop
+.LBB2_704: # =>This Inner Loop Header: Depth=1
+ movzx edx, byte ptr [rcx + rsi]
+ add dl, al
+ mov byte ptr [r8 + rsi], dl
+ movzx edx, byte ptr [rcx + rsi + 1]
+ add dl, al
+ mov byte ptr [r8 + rsi + 1], dl
+ movzx edx, byte ptr [rcx + rsi + 2]
+ add dl, al
+ mov byte ptr [r8 + rsi + 2], dl
+ movzx edx, byte ptr [rcx + rsi + 3]
+ add dl, al
+ mov byte ptr [r8 + rsi + 3], dl
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_704
+ jmp .LBB2_737
+.LBB2_165: # 32-bit case: [r8 + 4*i] = dword loaded from [rdx] - [rcx + 4*i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.166:
+ mov r11d, dword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_167 # fewer than 8 elements: use the scalar loops
+# %bb.285:
+ lea rdx, [rcx + 4*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_405 # rcx buffer ends at or before r8: no overlap (.LBB2_405 not in view, presumably the vector path)
+# %bb.286:
+ lea rdx, [r8 + 4*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_405 # r8 buffer ends at or before rcx: no overlap
+.LBB2_167: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_709:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_711
+.LBB2_710: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # fresh copy of the scalar; sub below overwrites eax
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_710
+.LBB2_711:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_709: skip the unrolled loop
+.LBB2_712: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_712
+ jmp .LBB2_737
+.LBB2_168: # 32-bit case: [r8 + 4*i] = dword loaded from [rdx] - [rcx + 4*i]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.169:
+ mov r11d, dword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_170 # fewer than 8 elements: use the scalar loops
+# %bb.288:
+ lea rdx, [rcx + 4*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_408 # rcx buffer ends at or before r8: no overlap (.LBB2_408 not in view, presumably the vector path)
+# %bb.289:
+ lea rdx, [r8 + 4*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_408 # r8 buffer ends at or before rcx: no overlap
+.LBB2_170: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_717:
+ mov rdx, rsi
+ not rdx
+ add rdx, r10 # rdx = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_719
+.LBB2_718: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d # fresh copy of the scalar; sub below overwrites eax
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_718
+.LBB2_719:
+ cmp rdx, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_717: skip the unrolled loop
+.LBB2_720: # =>This Inner Loop Header: Depth=1
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi]
+ mov dword ptr [r8 + 4*rsi], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 4]
+ mov dword ptr [r8 + 4*rsi + 4], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 8]
+ mov dword ptr [r8 + 4*rsi + 8], eax
+ mov eax, r11d
+ sub eax, dword ptr [rcx + 4*rsi + 12]
+ mov dword ptr [r8 + 4*rsi + 12], eax
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_720
+ jmp .LBB2_737
+.LBB2_171: # 32-bit case: [r8 + 4*i] = [rcx + 4*i] + dword loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.172:
+ mov eax, dword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_173 # fewer than 8 elements: use the scalar loops
+# %bb.291:
+ lea rdx, [rcx + 4*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_411 # rcx buffer ends at or before r8: no overlap (.LBB2_411 not in view, presumably the vector path)
+# %bb.292:
+ lea rdx, [r8 + 4*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_411 # r8 buffer ends at or before rcx: no overlap
+.LBB2_173: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_725:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_727
+.LBB2_726: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_726
+.LBB2_727:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_725: skip the unrolled loop
+.LBB2_728: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_728
+ jmp .LBB2_737
+.LBB2_174: # 32-bit case: [r8 + 4*i] = [rcx + 4*i] + dword loaded from [rdx]
+ test r9d, r9d
+ jle .LBB2_737 # r9d (element count) <= 0: nothing to process
+# %bb.175:
+ mov eax, dword ptr [rdx] # load the scalar operand once, before the loops
+ mov r10d, r9d # r10 = count zero-extended to 64 bits
+ cmp r9d, 8
+ jb .LBB2_176 # fewer than 8 elements: use the scalar loops
+# %bb.294:
+ lea rdx, [rcx + 4*r10] # rdx = end address of the rcx buffer
+ cmp rdx, r8
+ jbe .LBB2_414 # rcx buffer ends at or before r8: no overlap (.LBB2_414 not in view, presumably the vector path)
+# %bb.295:
+ lea rdx, [r8 + 4*r10] # rdx = end address of the r8 buffer
+ cmp rdx, rcx
+ jbe .LBB2_414 # r8 buffer ends at or before rcx: no overlap
+.LBB2_176: # count < 8 or buffers overlap: start the scalar loops at index 0
+ xor esi, esi
+.LBB2_733:
+ mov r9, rsi
+ not r9
+ add r9, r10 # r9 = count - rsi - 1
+ mov rdi, r10
+ and rdi, 3 # rdi = count mod 4 = trips through the one-element loop
+ je .LBB2_735
+.LBB2_734: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ add rsi, 1
+ add rdi, -1
+ jne .LBB2_734
+.LBB2_735:
+ cmp r9, 3
+ jb .LBB2_737 # fewer than 4 elements were left at .LBB2_733: skip the unrolled loop
+.LBB2_736: # =>This Inner Loop Header: Depth=1
+ mov edx, dword ptr [rcx + 4*rsi]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi], edx
+ mov edx, dword ptr [rcx + 4*rsi + 4]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 4], edx
+ mov edx, dword ptr [rcx + 4*rsi + 8]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 8], edx
+ mov edx, dword ptr [rcx + 4*rsi + 12]
+ add edx, eax
+ mov dword ptr [r8 + 4*rsi + 12], edx
+ add rsi, 4 # four elements per trip
+ cmp r10, rsi
+ jne .LBB2_736
+ jmp .LBB2_737
+.LBB2_297: # 128-bit vector path, 32-bit lanes: [r8 + 4*i] = r11d - [rcx + 4*i]
+ mov esi, r10d
+ and esi, -8 # esi = r10d rounded down to a multiple of 8 (r10d is set outside this block; presumably the element count)
+ movd xmm0, r11d # lane 0 = r11d; broadcast to all four lanes by the pshufd below
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8)/8 + 1 = number of 8-element chunks
+ test rdx, rdx
+ je .LBB2_417 # rsi == 8: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.298:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_299: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16 # rdi = element index; 16 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_299
+ jmp .LBB2_418 # continues outside this view
+.LBB2_300: # 128-bit vector path, 32-bit lanes: [r8 + 4*i] = r11d - [rcx + 4*i]
+ mov esi, r10d
+ and esi, -8 # esi = r10d rounded down to a multiple of 8 (r10d is set outside this block; presumably the element count)
+ movd xmm0, r11d # lane 0 = r11d; broadcast to all four lanes by the pshufd below
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8)/8 + 1 = number of 8-element chunks
+ test rdx, rdx
+ je .LBB2_425 # rsi == 8: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.301:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_302: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16 # rdi = element index; 16 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_302
+ jmp .LBB2_426 # continues outside this view
+.LBB2_303: # 128-bit vector path, 32-bit lanes: [r8 + 4*i] = [rcx + 4*i] + eax
+ mov esi, r10d
+ and esi, -8 # esi = r10d rounded down to a multiple of 8 (r10d is set outside this block; presumably the element count)
+ movd xmm0, eax # lane 0 = eax; broadcast to all four lanes by the pshufd below
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8)/8 + 1 = number of 8-element chunks
+ test rdx, rdx
+ je .LBB2_433 # rsi == 8: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.304:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_305: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16 # rdi = element index; 16 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_305
+ jmp .LBB2_434 # continues outside this view
+.LBB2_306: # 128-bit vector path, 32-bit lanes: [r8 + 4*i] = [rcx + 4*i] + eax
+ mov esi, r10d
+ and esi, -8 # esi = r10d rounded down to a multiple of 8 (r10d is set outside this block; presumably the element count)
+ movd xmm0, eax # lane 0 = eax; broadcast to all four lanes by the pshufd below
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1 # r9 = (rsi - 8)/8 + 1 = number of 8-element chunks
+ test rdx, rdx
+ je .LBB2_441 # rsi == 8: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.307:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_308: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16 # rdi = element index; 16 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_308
+ jmp .LBB2_442 # continues outside this view
+.LBB2_309: # 128-bit vector path, double-precision lanes: [r8 + 8*i] = low double of xmm0 - [rcx + 8*i]
+ mov edx, eax
+ and edx, -4 # edx = eax rounded down to a multiple of 4 (eax is set outside this block; presumably the element count)
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rdx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rdx - 4)/4 + 1 = number of 4-element chunks
+ test rsi, rsi
+ je .LBB2_449 # rdx == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.310:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_311: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ movapd xmm4, xmm1
+ subpd xmm4, xmm2
+ movapd xmm2, xmm1
+ subpd xmm2, xmm3
+ movupd xmmword ptr [r8 + 8*rdi], xmm4
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 48]
+ movapd xmm4, xmm1
+ subpd xmm4, xmm2
+ movapd xmm2, xmm1
+ subpd xmm2, xmm3
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm4
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rsi, 2
+ jne .LBB2_311
+ jmp .LBB2_450 # continues outside this view
+.LBB2_312: # 128-bit vector path, double-precision lanes: [r8 + 8*i] = low double of xmm0 - [rcx + 8*i]
+ mov edx, eax
+ and edx, -4 # edx = eax rounded down to a multiple of 4 (eax is set outside this block; presumably the element count)
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rdx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rdx - 4)/4 + 1 = number of 4-element chunks
+ test rsi, rsi
+ je .LBB2_457 # rdx == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.313:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_314: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ movapd xmm4, xmm1
+ subpd xmm4, xmm2
+ movapd xmm2, xmm1
+ subpd xmm2, xmm3
+ movupd xmmword ptr [r8 + 8*rdi], xmm4
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 48]
+ movapd xmm4, xmm1
+ subpd xmm4, xmm2
+ movapd xmm2, xmm1
+ subpd xmm2, xmm3
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm4
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rsi, 2
+ jne .LBB2_314
+ jmp .LBB2_458 # continues outside this view
+.LBB2_315: # 128-bit vector path, double-precision lanes: [r8 + 8*i] = [rcx + 8*i] + low double of xmm0
+ mov edx, eax
+ and edx, -4 # edx = eax rounded down to a multiple of 4 (eax is set outside this block; presumably the element count)
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rdx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rdx - 4)/4 + 1 = number of 4-element chunks
+ test rsi, rsi
+ je .LBB2_465 # rdx == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.316:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_317: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 48]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm3
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rsi, 2
+ jne .LBB2_317
+ jmp .LBB2_466 # continues outside this view
+.LBB2_318: # 128-bit vector path, double-precision lanes: [r8 + 8*i] = [rcx + 8*i] + low double of xmm0
+ mov edx, eax
+ and edx, -4 # edx = eax rounded down to a multiple of 4 (eax is set outside this block; presumably the element count)
+ movddup xmm1, xmm0 # xmm1 = xmm0[0,0]
+ lea rsi, [rdx - 4]
+ mov r9, rsi
+ shr r9, 2
+ add r9, 1 # r9 = (rdx - 4)/4 + 1 = number of 4-element chunks
+ test rsi, rsi
+ je .LBB2_473 # rdx == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.319:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi # rsi = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_320: # =>This Inner Loop Header: Depth=1
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+ movupd xmm2, xmmword ptr [rcx + 8*rdi + 32]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 48]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi + 32], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 48], xmm3
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rsi, 2
+ jne .LBB2_320
+ jmp .LBB2_474 # continues outside this view
+.LBB2_321: # 128-bit vector path, byte lanes: [r8 + i] = r11b - [rcx + i]
+ mov esi, r10d
+ and esi, -32 # esi = r10d rounded down to a multiple of 32 (r10d is set outside this block; presumably the element count)
+ movzx edx, r11b
+ movd xmm0, edx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every mask byte is 0, so byte 0 (r11b) is copied to all 16 lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_481 # rsi == 32: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.322:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_323: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64 # rdi = element index; 64 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_323
+ jmp .LBB2_482 # continues outside this view
+.LBB2_324: # 128-bit vector path, byte lanes: [r8 + i] = r11b - [rcx + i]
+ mov esi, r10d
+ and esi, -32 # esi = r10d rounded down to a multiple of 32 (r10d is set outside this block; presumably the element count)
+ movzx edx, r11b
+ movd xmm0, edx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every mask byte is 0, so byte 0 (r11b) is copied to all 16 lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_489 # rsi == 32: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.325:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_326: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64 # rdi = element index; 64 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_326
+ jmp .LBB2_490 # continues outside this view
+.LBB2_327: # 128-bit vector path, byte lanes: [r8 + i] = [rcx + i] + al
+ mov esi, r10d
+ and esi, -32 # esi = r10d rounded down to a multiple of 32 (r10d is set outside this block; presumably the element count)
+ movzx edx, al
+ movd xmm0, edx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every mask byte is 0, so byte 0 (al) is copied to all 16 lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_497 # rsi == 32: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.328:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_329: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64 # rdi = element index; 64 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_329
+ jmp .LBB2_498 # continues outside this view
+.LBB2_330: # 128-bit vector path, byte lanes: [r8 + i] = [rcx + i] + al
+ mov esi, r10d
+ and esi, -32 # esi = r10d rounded down to a multiple of 32 (r10d is set outside this block; presumably the element count)
+ movzx edx, al
+ movd xmm0, edx
+ pxor xmm1, xmm1 # all-zero shuffle mask
+ pshufb xmm0, xmm1 # every mask byte is 0, so byte 0 (al) is copied to all 16 lanes
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1 # r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ test rdx, rdx
+ je .LBB2_505 # rsi == 32: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.331:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_332: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64 # rdi = element index; 64 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_332
+ jmp .LBB2_506 # continues outside this view
+.LBB2_333: # 128-bit vector path, 64-bit lanes: [r8 + 8*i] = r11 - [rcx + 8*i]
+ mov esi, r10d
+ and esi, -4 # esi = r10d rounded down to a multiple of 4 (r10d is set outside this block; presumably the element count)
+ movq xmm0, r11 # low qword = r11; copied to both lanes by the pshufd below
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4)/4 + 1 = number of 4-element chunks
+ test rdx, rdx
+ je .LBB2_513 # rsi == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.334:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_335: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_335
+ jmp .LBB2_514 # continues outside this view
+.LBB2_336: # 128-bit vector path, 64-bit lanes: [r8 + 8*i] = r11 - [rcx + 8*i]
+ mov esi, r10d
+ and esi, -4 # esi = r10d rounded down to a multiple of 4 (r10d is set outside this block; presumably the element count)
+ movq xmm0, r11 # low qword = r11; copied to both lanes by the pshufd below
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4)/4 + 1 = number of 4-element chunks
+ test rdx, rdx
+ je .LBB2_521 # rsi == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.337:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_338: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_338
+ jmp .LBB2_522 # continues outside this view
+.LBB2_339: # 128-bit vector path, 64-bit lanes: [r8 + 8*i] = [rcx + 8*i] + rax
+ mov esi, r10d
+ and esi, -4 # esi = r10d rounded down to a multiple of 4 (r10d is set outside this block; presumably the element count)
+ movq xmm0, rax # low qword = rax; copied to both lanes by the pshufd below
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4)/4 + 1 = number of 4-element chunks
+ test rdx, rdx
+ je .LBB2_529 # rsi == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.340:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_341: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_341
+ jmp .LBB2_530 # continues outside this view
+.LBB2_342: # 128-bit vector path, 64-bit lanes: [r8 + 8*i] = [rcx + 8*i] + rax
+ mov esi, r10d
+ and esi, -4 # esi = r10d rounded down to a multiple of 4 (r10d is set outside this block; presumably the element count)
+ movq xmm0, rax # low qword = rax; copied to both lanes by the pshufd below
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1 # r9 = (rsi - 4)/4 + 1 = number of 4-element chunks
+ test rdx, rdx
+ je .LBB2_537 # rsi == 4: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.343:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_344: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8 # rdi = element index; 8 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_344
+ jmp .LBB2_538 # continues outside this view
+.LBB2_345: # 128-bit vector path, 16-bit lanes: [r8 + 2*i] = ax - [rcx + 2*i]
+ mov esi, r10d
+ and esi, -16 # esi = r10d rounded down to a multiple of 16 (r10d is set outside this block; presumably the element count)
+ movd xmm0, eax # the two shuffles below copy the low word (ax) to all eight lanes
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1 # r9 = (rsi - 16)/16 + 1 = number of 16-element chunks
+ test rdx, rdx
+ je .LBB2_545 # rsi == 16: a single chunk, skip the two-chunk loop (target not in view)
+# %bb.346:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx # rdx = -(r9 & -2); the loop adds 2 per trip until it reaches 0
+ xor edi, edi
+.LBB2_347: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32 # rdi = element index; 32 elements (two chunks) per trip
+ add rdx, 2
+ jne .LBB2_347
+ jmp .LBB2_546 # continues outside this view
+.LBB2_348:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_553
+# %bb.349:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_350: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_350
+ jmp .LBB2_554
+.LBB2_351:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_561
+# %bb.352:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_353: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_353
+ jmp .LBB2_562
+.LBB2_354:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_569
+# %bb.355:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_356: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubw xmm1, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm1
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_356
+ jmp .LBB2_570
+.LBB2_357:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_577
+# %bb.358:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_359: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_359
+ jmp .LBB2_578
+.LBB2_360:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_585
+# %bb.361:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_362: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_362
+ jmp .LBB2_586
+.LBB2_363:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_593
+# %bb.364:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_365: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_365
+ jmp .LBB2_594
+.LBB2_366:
+ mov esi, r10d
+ and esi, -16
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 224 # xmm0 = xmm0[0,0,2,3,4,5,6,7]
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 16]
+ mov r9, rdx
+ shr r9, 4
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_601
+# %bb.367:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_368: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 48]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 48], xmm2
+ add rdi, 32
+ add rdx, 2
+ jne .LBB2_368
+ jmp .LBB2_602
+.LBB2_369:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, r11
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_609
+# %bb.370:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_371: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8
+ add rdx, 2
+ jne .LBB2_371
+ jmp .LBB2_610
+.LBB2_372:
+ mov edx, eax
+ and edx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rdx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB2_617
+# %bb.373:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB2_374: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ movaps xmm4, xmm1
+ subps xmm4, xmm2
+ movaps xmm2, xmm1
+ subps xmm2, xmm3
+ movups xmmword ptr [r8 + 4*rdi], xmm4
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 48]
+ movaps xmm4, xmm1
+ subps xmm4, xmm2
+ movaps xmm2, xmm1
+ subps xmm2, xmm3
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm4
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rsi, 2
+ jne .LBB2_374
+ jmp .LBB2_618
+.LBB2_375:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, r11
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_625
+# %bb.376:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_377: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubq xmm1, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm1
+ add rdi, 8
+ add rdx, 2
+ jne .LBB2_377
+ jmp .LBB2_626
+.LBB2_378:
+ mov edx, eax
+ and edx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rdx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB2_633
+# %bb.379:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB2_380: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ movaps xmm4, xmm1
+ subps xmm4, xmm2
+ movaps xmm2, xmm1
+ subps xmm2, xmm3
+ movups xmmword ptr [r8 + 4*rdi], xmm4
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 48]
+ movaps xmm4, xmm1
+ subps xmm4, xmm2
+ movaps xmm2, xmm1
+ subps xmm2, xmm3
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm4
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rsi, 2
+ jne .LBB2_380
+ jmp .LBB2_634
+.LBB2_381:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_641
+# %bb.382:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_383: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rdx, 2
+ jne .LBB2_383
+ jmp .LBB2_642
+.LBB2_384:
+ mov edx, eax
+ and edx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rdx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB2_649
+# %bb.385:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB2_386: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 48]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm3
+ add rdi, 16
+ add rsi, 2
+ jne .LBB2_386
+ jmp .LBB2_650
+.LBB2_387:
+ mov esi, r10d
+ and esi, -4
+ movq xmm0, rax
+ pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
+ lea rdx, [rsi - 4]
+ mov r9, rdx
+ shr r9, 2
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_657
+# %bb.388:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_389: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 48]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 48], xmm2
+ add rdi, 8
+ add rdx, 2
+ jne .LBB2_389
+ jmp .LBB2_658
+.LBB2_390:
+ mov edx, eax
+ and edx, -8
+ movaps xmm1, xmm0
+ shufps xmm1, xmm0, 0 # xmm1 = xmm1[0,0],xmm0[0,0]
+ lea rsi, [rdx - 8]
+ mov r9, rsi
+ shr r9, 3
+ add r9, 1
+ test rsi, rsi
+ je .LBB2_665
+# %bb.391:
+ mov rsi, r9
+ and rsi, -2
+ neg rsi
+ xor edi, edi
+.LBB2_392: # =>This Inner Loop Header: Depth=1
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+ movups xmm2, xmmword ptr [rcx + 4*rdi + 32]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 48]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi + 32], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 48], xmm3
+ add rdi, 16
+ add rsi, 2
+ jne .LBB2_392
+ jmp .LBB2_666
+.LBB2_393:
+ mov esi, r10d
+ and esi, -32
+ movzx edx, r11b
+ movd xmm0, edx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_673
+# %bb.394:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_395: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_395
+ jmp .LBB2_674
+.LBB2_396:
+ mov esi, r10d
+ and esi, -32
+ movzx edx, r11b
+ movd xmm0, edx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_681
+# %bb.397:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_398: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubb xmm1, xmm2
+ movdqu xmmword ptr [r8 + rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + rdi + 48], xmm1
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_398
+ jmp .LBB2_682
+.LBB2_399:
+ mov esi, r10d
+ and esi, -32
+ movzx edx, al
+ movd xmm0, edx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_689
+# %bb.400:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_401: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_401
+ jmp .LBB2_690
+.LBB2_402:
+ mov esi, r10d
+ and esi, -32
+ movzx edx, al
+ movd xmm0, edx
+ pxor xmm1, xmm1
+ pshufb xmm0, xmm1
+ lea rdx, [rsi - 32]
+ mov r9, rdx
+ shr r9, 5
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_697
+# %bb.403:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_404: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 48]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + rdi + 48], xmm2
+ add rdi, 64
+ add rdx, 2
+ jne .LBB2_404
+ jmp .LBB2_698
+.LBB2_405:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, r11d
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_705
+# %bb.406:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_407: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16
+ add rdx, 2
+ jne .LBB2_407
+ jmp .LBB2_706
+.LBB2_408:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, r11d
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_713
+# %bb.409:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_410: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ movdqa xmm1, xmm0
+ psubd xmm1, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm1
+ add rdi, 16
+ add rdx, 2
+ jne .LBB2_410
+ jmp .LBB2_714
+.LBB2_411:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_721
+# %bb.412:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_413: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rdx, 2
+ jne .LBB2_413
+ jmp .LBB2_722
+.LBB2_414:
+ mov esi, r10d
+ and esi, -8
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 # xmm0 = xmm0[0,0,0,0]
+ lea rdx, [rsi - 8]
+ mov r9, rdx
+ shr r9, 3
+ add r9, 1
+ test rdx, rdx
+ je .LBB2_729
+# %bb.415:
+ mov rdx, r9
+ and rdx, -2
+ neg rdx
+ xor edi, edi
+.LBB2_416: # =>This Inner Loop Header: Depth=1
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi + 32]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 48]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi + 32], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 48], xmm2
+ add rdi, 16
+ add rdx, 2
+ jne .LBB2_416
+ jmp .LBB2_730
+.LBB2_417:
+ xor edi, edi
+.LBB2_418:
+ test r9b, 1
+ je .LBB2_420
+# %bb.419:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ psubd xmm0, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB2_420:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_421
+.LBB2_425:
+ xor edi, edi
+.LBB2_426:
+ test r9b, 1
+ je .LBB2_428
+# %bb.427:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ psubd xmm0, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB2_428:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_429
+.LBB2_433:
+ xor edi, edi
+.LBB2_434:
+ test r9b, 1
+ je .LBB2_436
+# %bb.435:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB2_436:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_437
+.LBB2_441:
+ xor edi, edi
+.LBB2_442:
+ test r9b, 1
+ je .LBB2_444
+# %bb.443:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB2_444:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_445
+.LBB2_449:
+ xor edi, edi
+.LBB2_450:
+ test r9b, 1
+ je .LBB2_452
+# %bb.451:
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ movapd xmm4, xmm1
+ subpd xmm4, xmm2
+ subpd xmm1, xmm3
+ movupd xmmword ptr [r8 + 8*rdi], xmm4
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB2_452:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_453
+.LBB2_457:
+ xor edi, edi
+.LBB2_458:
+ test r9b, 1
+ je .LBB2_460
+# %bb.459:
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ movapd xmm4, xmm1
+ subpd xmm4, xmm2
+ subpd xmm1, xmm3
+ movupd xmmword ptr [r8 + 8*rdi], xmm4
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm1
+.LBB2_460:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_461
+.LBB2_465:
+ xor edi, edi
+.LBB2_466:
+ test r9b, 1
+ je .LBB2_468
+# %bb.467:
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+.LBB2_468:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_469
+.LBB2_473:
+ xor edi, edi
+.LBB2_474:
+ test r9b, 1
+ je .LBB2_476
+# %bb.475:
+ movupd xmm2, xmmword ptr [rcx + 8*rdi]
+ movupd xmm3, xmmword ptr [rcx + 8*rdi + 16]
+ addpd xmm2, xmm1
+ addpd xmm3, xmm1
+ movupd xmmword ptr [r8 + 8*rdi], xmm2
+ movupd xmmword ptr [r8 + 8*rdi + 16], xmm3
+.LBB2_476:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_477
+.LBB2_481:
+ xor edi, edi
+.LBB2_482:
+ test r9b, 1
+ je .LBB2_484
+# %bb.483:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ psubb xmm0, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB2_484:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_485
+.LBB2_489:
+ xor edi, edi
+.LBB2_490:
+ test r9b, 1
+ je .LBB2_492
+# %bb.491:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ psubb xmm0, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB2_492:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_493
+.LBB2_497:
+ xor edi, edi
+.LBB2_498:
+ test r9b, 1
+ je .LBB2_500
+# %bb.499:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB2_500:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_501
+.LBB2_505:
+ xor edi, edi
+.LBB2_506:
+ test r9b, 1
+ je .LBB2_508
+# %bb.507:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB2_508:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_509
+.LBB2_513:
+ xor edi, edi
+.LBB2_514:
+ test r9b, 1
+ je .LBB2_516
+# %bb.515:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ psubq xmm0, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB2_516:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_517
+.LBB2_521:
+ xor edi, edi
+.LBB2_522:
+ test r9b, 1
+ je .LBB2_524
+# %bb.523:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ psubq xmm0, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB2_524:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_525
+.LBB2_529:
+ xor edi, edi
+.LBB2_530:
+ test r9b, 1
+ je .LBB2_532
+# %bb.531:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB2_532:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_533
+.LBB2_537:
+ xor edi, edi
+.LBB2_538:
+ test r9b, 1
+ je .LBB2_540
+# %bb.539:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB2_540:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_541
+.LBB2_545:
+ xor edi, edi
+.LBB2_546:
+ test r9b, 1
+ je .LBB2_548
+# %bb.547:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ psubw xmm0, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB2_548:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_549
+.LBB2_553:
+ xor edi, edi
+.LBB2_554:
+ test r9b, 1
+ je .LBB2_556
+# %bb.555:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ psubw xmm0, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB2_556:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_557
+.LBB2_561:
+ xor edi, edi
+.LBB2_562:
+ test r9b, 1
+ je .LBB2_564
+# %bb.563:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ psubw xmm0, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB2_564:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_565
+.LBB2_569:
+ xor edi, edi
+.LBB2_570:
+ test r9b, 1
+ je .LBB2_572
+# %bb.571:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ movdqa xmm3, xmm0
+ psubw xmm3, xmm1
+ psubw xmm0, xmm2
+ movdqu xmmword ptr [r8 + 2*rdi], xmm3
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm0
+.LBB2_572:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_573
+.LBB2_577:
+ xor edi, edi
+.LBB2_578:
+ test r9b, 1
+ je .LBB2_580
+# %bb.579:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB2_580:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_581
+.LBB2_585:
+ xor edi, edi
+.LBB2_586:
+ test r9b, 1
+ je .LBB2_588
+# %bb.587:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB2_588:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_589
+.LBB2_593:
+ xor edi, edi
+.LBB2_594:
+ test r9b, 1
+ je .LBB2_596
+# %bb.595:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB2_596:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_597
+.LBB2_601:
+ xor edi, edi
+.LBB2_602:
+ test r9b, 1
+ je .LBB2_604
+# %bb.603:
+ movdqu xmm1, xmmword ptr [rcx + 2*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 2*rdi + 16]
+ paddw xmm1, xmm0
+ paddw xmm2, xmm0
+ movdqu xmmword ptr [r8 + 2*rdi], xmm1
+ movdqu xmmword ptr [r8 + 2*rdi + 16], xmm2
+.LBB2_604:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_605
+.LBB2_609:
+ xor edi, edi
+.LBB2_610:
+ test r9b, 1
+ je .LBB2_612
+# %bb.611:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ psubq xmm0, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB2_612:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_613
+.LBB2_617:
+ xor edi, edi
+.LBB2_618:
+ test r9b, 1
+ je .LBB2_620
+# %bb.619:
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ movaps xmm4, xmm1
+ subps xmm4, xmm2
+ subps xmm1, xmm3
+ movups xmmword ptr [r8 + 4*rdi], xmm4
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB2_620:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_621
+.LBB2_625:
+ xor edi, edi
+.LBB2_626:
+ test r9b, 1
+ je .LBB2_628
+# %bb.627:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ movdqa xmm3, xmm0
+ psubq xmm3, xmm1
+ psubq xmm0, xmm2
+ movdqu xmmword ptr [r8 + 8*rdi], xmm3
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm0
+.LBB2_628:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_629
+.LBB2_633:
+ xor edi, edi
+.LBB2_634:
+ test r9b, 1
+ je .LBB2_636
+# %bb.635:
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ movaps xmm4, xmm1
+ subps xmm4, xmm2
+ subps xmm1, xmm3
+ movups xmmword ptr [r8 + 4*rdi], xmm4
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm1
+.LBB2_636:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_637
+.LBB2_641:
+ xor edi, edi
+.LBB2_642:
+ test r9b, 1
+ je .LBB2_644
+# %bb.643:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB2_644:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_645
+.LBB2_649:
+ xor edi, edi
+.LBB2_650:
+ test r9b, 1
+ je .LBB2_652
+# %bb.651:
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+.LBB2_652:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_653
+.LBB2_657:
+ xor edi, edi
+.LBB2_658:
+ test r9b, 1
+ je .LBB2_660
+# %bb.659:
+ movdqu xmm1, xmmword ptr [rcx + 8*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 8*rdi + 16]
+ paddq xmm1, xmm0
+ paddq xmm2, xmm0
+ movdqu xmmword ptr [r8 + 8*rdi], xmm1
+ movdqu xmmword ptr [r8 + 8*rdi + 16], xmm2
+.LBB2_660:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_661
+.LBB2_665:
+ xor edi, edi
+.LBB2_666:
+ test r9b, 1
+ je .LBB2_668
+# %bb.667:
+ movups xmm2, xmmword ptr [rcx + 4*rdi]
+ movups xmm3, xmmword ptr [rcx + 4*rdi + 16]
+ addps xmm2, xmm1
+ addps xmm3, xmm1
+ movups xmmword ptr [r8 + 4*rdi], xmm2
+ movups xmmword ptr [r8 + 4*rdi + 16], xmm3
+.LBB2_668:
+ cmp rdx, rax
+ je .LBB2_737
+ jmp .LBB2_669
+.LBB2_673:
+ xor edi, edi
+.LBB2_674:
+ test r9b, 1
+ je .LBB2_676
+# %bb.675:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ psubb xmm0, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB2_676:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_677
+.LBB2_681:
+ xor edi, edi
+.LBB2_682:
+ test r9b, 1
+ je .LBB2_684
+# %bb.683:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ movdqa xmm3, xmm0
+ psubb xmm3, xmm1
+ psubb xmm0, xmm2
+ movdqu xmmword ptr [r8 + rdi], xmm3
+ movdqu xmmword ptr [r8 + rdi + 16], xmm0
+.LBB2_684:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_685
+.LBB2_689:
+ xor edi, edi
+.LBB2_690:
+ test r9b, 1
+ je .LBB2_692
+# %bb.691:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB2_692:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_693
+.LBB2_697:
+ xor edi, edi
+.LBB2_698:
+ test r9b, 1
+ je .LBB2_700
+# %bb.699:
+ movdqu xmm1, xmmword ptr [rcx + rdi]
+ movdqu xmm2, xmmword ptr [rcx + rdi + 16]
+ paddb xmm1, xmm0
+ paddb xmm2, xmm0
+ movdqu xmmword ptr [r8 + rdi], xmm1
+ movdqu xmmword ptr [r8 + rdi + 16], xmm2
+.LBB2_700:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_701
+.LBB2_705:
+ xor edi, edi
+.LBB2_706:
+ test r9b, 1
+ je .LBB2_708
+# %bb.707:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ psubd xmm0, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB2_708:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_709
+.LBB2_713:
+ xor edi, edi
+.LBB2_714:
+ test r9b, 1
+ je .LBB2_716
+# %bb.715:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ movdqa xmm3, xmm0
+ psubd xmm3, xmm1
+ psubd xmm0, xmm2
+ movdqu xmmword ptr [r8 + 4*rdi], xmm3
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm0
+.LBB2_716:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_717
+.LBB2_721:
+ xor edi, edi
+.LBB2_722:
+ test r9b, 1
+ je .LBB2_724
+# %bb.723:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB2_724:
+ cmp rsi, r10
+ je .LBB2_737
+ jmp .LBB2_725
+.LBB2_729:
+ xor edi, edi
+.LBB2_730:
+ test r9b, 1
+ je .LBB2_732
+# %bb.731:
+ movdqu xmm1, xmmword ptr [rcx + 4*rdi]
+ movdqu xmm2, xmmword ptr [rcx + 4*rdi + 16]
+ paddd xmm1, xmm0
+ paddd xmm2, xmm0
+ movdqu xmmword ptr [r8 + 4*rdi], xmm1
+ movdqu xmmword ptr [r8 + 4*rdi + 16], xmm2
+.LBB2_732:
+ cmp rsi, r10
+ jne .LBB2_733
+.LBB2_737:
+ mov rsp, rbp
+ pop rbp
+ ret
+.Lfunc_end2:
+ .size arithmetic_scalar_arr_sse4, .Lfunc_end2-arithmetic_scalar_arr_sse4
+ # -- End function
+ .ident "Ubuntu clang version 11.1.0-6"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
diff --git a/go/arrow/compute/internal/kernels/_lib/cast_numeric.cc b/go/arrow/compute/internal/kernels/_lib/cast_numeric.cc
index 666d85a6ac5..1e8c821ea5e 100644
--- a/go/arrow/compute/internal/kernels/_lib/cast_numeric.cc
+++ b/go/arrow/compute/internal/kernels/_lib/cast_numeric.cc
@@ -16,23 +16,7 @@
#include
#include
-
-// corresponds to datatype.go's arrow.Type
-enum class arrtype : int {
- NULL,
- BOOL,
- UINT8,
- INT8,
- UINT16,
- INT16,
- UINT32,
- INT32,
- UINT64,
- INT64,
- FLOAT16,
- FLOAT32,
- FLOAT64
-};
+#include "types.h"
template
static inline void FULL_NAME(cast_tmpl_numeric)(const I* in, O* out, const int len) {
diff --git a/go/arrow/compute/internal/kernels/_lib/types.h b/go/arrow/compute/internal/kernels/_lib/types.h
new file mode 100644
index 00000000000..5e0a3ae01c1
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/types.h
@@ -0,0 +1,368 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// corresponds to datatype.go's arrow.Type; implicit numbering gives NULL = 0 through FLOAT64 = 12
+enum class arrtype : int {
+ NULL, // NOTE(review): clashes with the NULL macro if a header defining it (e.g. <stddef.h>) is included first -- confirm include order
+ BOOL,
+ UINT8,
+ INT8,
+ UINT16,
+ INT16,
+ UINT32,
+ INT32,
+ UINT64,
+ INT64,
+ FLOAT16,
+ FLOAT32,
+ FLOAT64
+};
+
+
+#define _LIBCPP_TEMPLATE_VIS // shim: expands to nothing so the copied libc++ declarations parse standalone
+#define _LIBCPP_CONSTEXPR constexpr
+#define _LIBCPP_INLINE_VISIBILITY // shim: expands to nothing
+#define _LIBCPP_STD_VER 17 // enables every `_LIBCPP_STD_VER > 11` / `> 14` section below
+#define _LIBCPP_NODEBUG // shim: expands to nothing
+#define _LIBCPP_HAS_NO_CHAR8_T // disables the specialization guarded by `#ifndef _LIBCPP_HAS_NO_CHAR8_T`
+#define _NOEXCEPT noexcept
+#define _NOEXCEPT_(x) noexcept(x)
+
+// copied from libcxx/include/__type_traits/integral_constant.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, _Tp __v> // wraps the compile-time constant __v of type _Tp
+struct _LIBCPP_TEMPLATE_VIS integral_constant
+{
+ static _LIBCPP_CONSTEXPR const _Tp value = __v;
+ typedef _Tp value_type;
+ typedef integral_constant type;
+ _LIBCPP_INLINE_VISIBILITY
+ _LIBCPP_CONSTEXPR operator value_type() const _NOEXCEPT {return value;}
+#if _LIBCPP_STD_VER > 11
+ _LIBCPP_INLINE_VISIBILITY
+ constexpr value_type operator ()() const _NOEXCEPT {return value;}
+#endif
+};
+
+template <class _Tp, _Tp __v> // out-of-class definition of the static data member
+_LIBCPP_CONSTEXPR const _Tp integral_constant<_Tp, __v>::value;
+
+typedef integral_constant<bool, true> true_type;
+typedef integral_constant<bool, false> false_type;
+
+template <bool _Val> // internal shorthand used by the traits below
+using _BoolConstant _LIBCPP_NODEBUG = integral_constant<bool, _Val>;
+
+#if _LIBCPP_STD_VER > 14
+template <bool __b>
+using bool_constant = integral_constant<bool, __b>;
+#endif
+
+// copied from libcxx/include/__type_traits/remove_const.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __has_builtin(__remove_const) // prefer the compiler builtin when it exists
+template <class _Tp>
+struct remove_const {
+ using type _LIBCPP_NODEBUG = __remove_const(_Tp);
+};
+
+template <class _Tp>
+using __remove_const_t = __remove_const(_Tp);
+#else
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_const {typedef _Tp type;}; // primary: type is unchanged
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_const<const _Tp> {typedef _Tp type;}; // strips top-level const
+
+template <class _Tp>
+using __remove_const_t = typename remove_const<_Tp>::type;
+#endif // __has_builtin(__remove_const)
+
+#if _LIBCPP_STD_VER > 11
+template <class _Tp> using remove_const_t = __remove_const_t<_Tp>;
+#endif
+
+// copied from libcxx/include/__type_traits/remove_volatile.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __has_builtin(__remove_volatile) // prefer the compiler builtin when it exists
+template <class _Tp>
+struct remove_volatile {
+ using type _LIBCPP_NODEBUG = __remove_volatile(_Tp);
+};
+
+template <class _Tp>
+using __remove_volatile_t = __remove_volatile(_Tp);
+#else
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_volatile {typedef _Tp type;}; // primary: type is unchanged
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_volatile<volatile _Tp> {typedef _Tp type;}; // strips top-level volatile
+
+template <class _Tp>
+using __remove_volatile_t = typename remove_volatile<_Tp>::type;
+#endif // __has_builtin(__remove_volatile)
+
+#if _LIBCPP_STD_VER > 11
+template <class _Tp> using remove_volatile_t = __remove_volatile_t<_Tp>;
+#endif
+
+// copied from libcxx/include/__type_traits/remove_cv.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __has_builtin(__remove_cv) // prefer the compiler builtin when it exists
+template <class _Tp>
+struct remove_cv {
+ using type _LIBCPP_NODEBUG = __remove_cv(_Tp);
+};
+
+template <class _Tp>
+using __remove_cv_t = __remove_cv(_Tp);
+#else
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_cv // fallback: remove const first, then volatile
+{typedef __remove_volatile_t<__remove_const_t<_Tp> > type;};
+
+template <class _Tp>
+using __remove_cv_t = __remove_volatile_t<__remove_const_t<_Tp> >;
+#endif // __has_builtin(__remove_cv)
+
+#if _LIBCPP_STD_VER > 11
+template <class _Tp> using remove_cv_t = __remove_cv_t<_Tp>;
+#endif
+
+// copied from libcxx/include/__type_traits/is_floating_point.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+template <class _Tp> struct __libcpp_is_floating_point : public false_type {}; // primary: not floating point
+template <> struct __libcpp_is_floating_point<float> : public true_type {};
+template <> struct __libcpp_is_floating_point<double> : public true_type {};
+template <> struct __libcpp_is_floating_point<long double> : public true_type {};
+
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_floating_point // cv-qualifiers are stripped before the lookup
+ : public __libcpp_is_floating_point<__remove_cv_t<_Tp> > {};
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_floating_point_v = is_floating_point<_Tp>::value;
+#endif
+
+// copied from libcxx/include/__type_traits/is_integral.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+template <class _Tp> struct __libcpp_is_integral { enum { value = 0 }; }; // primary: not integral
+template <> struct __libcpp_is_integral<bool> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<char> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<signed char> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<unsigned char> { enum { value = 1 }; };
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+template <> struct __libcpp_is_integral<wchar_t> { enum { value = 1 }; };
+#endif
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
+template <> struct __libcpp_is_integral<char8_t> { enum { value = 1 }; };
+#endif
+template <> struct __libcpp_is_integral<char16_t> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<char32_t> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<short> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<unsigned short> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<int> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<unsigned int> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<long> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<unsigned long> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<long long> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<unsigned long long> { enum { value = 1 }; };
+#ifndef _LIBCPP_HAS_NO_INT128
+template <> struct __libcpp_is_integral<__int128_t> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<__uint128_t> { enum { value = 1 }; };
+#endif
+
+#if __has_builtin(__is_integral) // prefer the compiler builtin when it exists
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_integral : _BoolConstant<__is_integral(_Tp)> { };
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_integral_v = __is_integral(_Tp);
+#endif
+
+#else
+
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_integral // fallback: table lookup on the cv-stripped type
+ : public _BoolConstant<__libcpp_is_integral<__remove_cv_t<_Tp> >::value> {};
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_integral_v = is_integral<_Tp>::value;
+#endif
+
+#endif // __has_builtin(__is_integral)
+
+// copied from libcxx/include/__type_traits/is_arithmetic.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_arithmetic // true for integral or floating-point types
+ : public integral_constant<bool, is_integral<_Tp>::value ||
+ is_floating_point<_Tp>::value> {};
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_arithmetic_v = is_arithmetic<_Tp>::value;
+#endif
+
+// copied from libcxx/include/__type_traits/is_signed.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if __has_builtin(__is_signed) // prefer the compiler builtin when it exists
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_signed : _BoolConstant<__is_signed(_Tp)> { };
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_signed_v = __is_signed(_Tp);
+#endif
+
+#else // __has_builtin(__is_signed)
+
+template <class _Tp, bool = is_integral<_Tp>::value> // integral case: signed iff -1 compares below 0
+struct __libcpp_is_signed_impl : public _BoolConstant<(_Tp(-1) < _Tp(0))> {};
+
+template <class _Tp>
+struct __libcpp_is_signed_impl<_Tp, false> : public true_type {}; // floating point
+
+template <class _Tp, bool = is_arithmetic<_Tp>::value> // only arithmetic types reach the impl
+struct __libcpp_is_signed : public __libcpp_is_signed_impl<_Tp> {};
+
+template <class _Tp> struct __libcpp_is_signed<_Tp, false> : public false_type {};
+
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_signed : public __libcpp_is_signed<_Tp> {};
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_signed_v = is_signed<_Tp>::value;
+#endif
+
+#endif // __has_builtin(__is_signed)
+
+
+// copied from libcxx/include/__type_traits/is_unsigned.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+// Before AppleClang 14, __is_unsigned returned true for enums with signed underlying type.
+#if __has_builtin(__is_unsigned) && !(defined(_LIBCPP_APPLE_CLANG_VER) && _LIBCPP_APPLE_CLANG_VER < 1400)
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_unsigned : _BoolConstant<__is_unsigned(_Tp)> { };
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_unsigned_v = __is_unsigned(_Tp);
+#endif
+
+#else // __has_builtin(__is_unsigned)
+
+template <class _Tp, bool = is_integral<_Tp>::value> // integral case: unsigned iff 0 compares below -1 (wrap-around)
+struct __libcpp_is_unsigned_impl : public _BoolConstant<(_Tp(0) < _Tp(-1))> {};
+
+template <class _Tp>
+struct __libcpp_is_unsigned_impl<_Tp, false> : public false_type {}; // floating point
+
+template <class _Tp, bool = is_arithmetic<_Tp>::value> // only arithmetic types reach the impl
+struct __libcpp_is_unsigned : public __libcpp_is_unsigned_impl<_Tp> {};
+
+template <class _Tp> struct __libcpp_is_unsigned<_Tp, false> : public false_type {};
+
+template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_unsigned : public __libcpp_is_unsigned<_Tp> {};
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp>
+inline constexpr bool is_unsigned_v = is_unsigned<_Tp>::value;
+#endif
+
+#endif // __has_builtin(__is_unsigned)
+
+// copied from libcxx/include/__type_traits/is_same.h
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+template <class _Tp, class _Up> // delegates to the compiler's __is_same builtin
+struct _LIBCPP_TEMPLATE_VIS is_same : _BoolConstant<__is_same(_Tp, _Up)> { };
+
+#if _LIBCPP_STD_VER > 14
+template <class _Tp, class _Up>
+inline constexpr bool is_same_v = __is_same(_Tp, _Up);
+#endif
diff --git a/go/arrow/compute/internal/kernels/_lib/vendored/safe-math.h b/go/arrow/compute/internal/kernels/_lib/vendored/safe-math.h
new file mode 100644
index 00000000000..7f6426ac765
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/_lib/vendored/safe-math.h
@@ -0,0 +1,1072 @@
+/* Overflow-safe math functions
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson
+ *
+ * To the extent possible under law, the authors have waived all
+ * copyright and related or neighboring rights to this code. For
+ * details, see the Creative Commons Zero 1.0 Universal license at
+ * https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#if !defined(PSNIP_SAFE_H)
+#define PSNIP_SAFE_H
+
+#if !defined(PSNIP_SAFE_FORCE_PORTABLE) /* defining PSNIP_SAFE_FORCE_PORTABLE skips both probes below */
+# if defined(__has_builtin)
+# if __has_builtin(__builtin_add_overflow) && !defined(__ibmxl__)
+# define PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW
+# endif
+# elif defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__INTEL_COMPILER)
+# define PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW
+# endif
+# if defined(__has_include)
+# if __has_include(<intsafe.h>) /* probe for the header that is included further down */
+# define PSNIP_SAFE_HAVE_INTSAFE_H
+# endif
+# elif defined(_WIN32)
+# define PSNIP_SAFE_HAVE_INTSAFE_H
+# endif
+#endif /* !defined(PSNIP_SAFE_FORCE_PORTABLE) */
+
+#if defined(__GNUC__) /* branch-prediction hints; both forms normalise expr to 0/1 with !! */
+# define PSNIP_SAFE_LIKELY(expr) __builtin_expect(!!(expr), 1)
+# define PSNIP_SAFE_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#else
+# define PSNIP_SAFE_LIKELY(expr) !!(expr)
+# define PSNIP_SAFE_UNLIKELY(expr) !!(expr)
+#endif /* defined(__GNUC__) */
+
+#if !defined(PSNIP_SAFE_STATIC_INLINE) /* NOTE(review): PSNIP_SAFE__FUNCTION is only defined inside this guard -- confirm the other path is handled elsewhere */
+# if defined(__GNUC__)
+# define PSNIP_SAFE__COMPILER_ATTRIBUTES __attribute__((__unused__))
+# else
+# define PSNIP_SAFE__COMPILER_ATTRIBUTES
+# endif
+
+# if defined(HEDLEY_INLINE) /* pick the first inline spelling that the toolchain advertises */
+# define PSNIP_SAFE__INLINE HEDLEY_INLINE
+# elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+# define PSNIP_SAFE__INLINE inline
+# elif defined(__GNUC_STDC_INLINE__)
+# define PSNIP_SAFE__INLINE __inline__
+# elif defined(_MSC_VER) && _MSC_VER >= 1200
+# define PSNIP_SAFE__INLINE __inline
+# else
+# define PSNIP_SAFE__INLINE
+# endif
+
+# define PSNIP_SAFE__FUNCTION PSNIP_SAFE__COMPILER_ATTRIBUTES static PSNIP_SAFE__INLINE
+#endif
+
+// !defined(__cplusplus) added for Solaris support
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+# define psnip_safe_bool _Bool
+#else
+# define psnip_safe_bool int /* C++ and pre-C99 builds use int as the success flag type */
+#endif
+
+#if !defined(PSNIP_SAFE_NO_FIXED)
+/* For maximum portability include the exact-int module from
+ portable snippets. */
+# if \
+ !defined(psnip_int64_t) || !defined(psnip_uint64_t) || \
+ !defined(psnip_int32_t) || !defined(psnip_uint32_t) || \
+ !defined(psnip_int16_t) || !defined(psnip_uint16_t) || \
+ !defined(psnip_int8_t) || !defined(psnip_uint8_t)
+# include <stdint.h> /* source of the intN_t / uintN_t names aliased below */
+# if !defined(psnip_int64_t)
+# define psnip_int64_t int64_t
+# endif
+# if !defined(psnip_uint64_t)
+# define psnip_uint64_t uint64_t
+# endif
+# if !defined(psnip_int32_t)
+# define psnip_int32_t int32_t
+# endif
+# if !defined(psnip_uint32_t)
+# define psnip_uint32_t uint32_t
+# endif
+# if !defined(psnip_int16_t)
+# define psnip_int16_t int16_t
+# endif
+# if !defined(psnip_uint16_t)
+# define psnip_uint16_t uint16_t
+# endif
+# if !defined(psnip_int8_t)
+# define psnip_int8_t int8_t
+# endif
+# if !defined(psnip_uint8_t)
+# define psnip_uint8_t uint8_t
+# endif
+# endif
+#endif /* !defined(PSNIP_SAFE_NO_FIXED) */
+#include <limits.h> /* SCHAR_MAX, INT_MAX, ULLONG_MAX, ... used by the cascades below */
+#include <stdlib.h> /* size_t */
+
+#if !defined(PSNIP_SAFE_SIZE_MAX)
+# if defined(__SIZE_MAX__)
+# define PSNIP_SAFE_SIZE_MAX __SIZE_MAX__
+# elif defined(PSNIP_EXACT_INT_HAVE_STDINT)
+# include <stdint.h>
+# endif
+#endif
+
+#if defined(PSNIP_SAFE_SIZE_MAX)
+# define PSNIP_SAFE__SIZE_MAX_RT PSNIP_SAFE_SIZE_MAX
+#else
+# define PSNIP_SAFE__SIZE_MAX_RT (~((size_t) 0)) /* all-ones size_t: computed fallback for the maximum */
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_INTSAFE_H)
+/* In VS 10, stdint.h and intsafe.h both define (U)INTN_MIN/MAX, which
+ triggers warning C4005 (level 1). */
+# if defined(_MSC_VER) && (_MSC_VER == 1600)
+# pragma warning(push)
+# pragma warning(disable:4005)
+# endif
+# include <intsafe.h>
+# if defined(_MSC_VER) && (_MSC_VER == 1600)
+# pragma warning(pop)
+# endif
+#endif /* defined(PSNIP_SAFE_HAVE_INTSAFE_H) */
+
+/* If there is a type larger than the one we're concerned with it's
+ * likely much faster to simply promote the operands, perform the
+ * requested operation, verify that the result falls within the
+ * original type, then cast the result back to the original type. */
+
+#if !defined(PSNIP_SAFE_NO_PROMOTIONS)
+
+#define PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, op_name, op) \
+ PSNIP_SAFE__FUNCTION psnip_safe_##name##_larger \
+ psnip_safe_larger_##name##_##op_name (T a, T b) { \
+ return ((psnip_safe_##name##_larger) a) op ((psnip_safe_##name##_larger) b); \
+ } /* emits psnip_safe_larger_<name>_<op_name>: widens both operands, then applies op */
+
+#define PSNIP_SAFE_DEFINE_LARGER_UNARY_OP(T, name, op_name, op) \
+ PSNIP_SAFE__FUNCTION psnip_safe_##name##_larger \
+ psnip_safe_larger_##name##_##op_name (T value) { \
+ return (op ((psnip_safe_##name##_larger) value)); \
+ } /* emits psnip_safe_larger_<name>_<op_name>: widens the operand, then applies op */
+
+#define PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(T, name) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, add, +) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, sub, -) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mul, *) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, div, /) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mod, %) \
+ PSNIP_SAFE_DEFINE_LARGER_UNARY_OP (T, name, neg, -)
+
+#define PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(T, name) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, add, +) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, sub, -) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mul, *) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, div, /) \
+ PSNIP_SAFE_DEFINE_LARGER_BINARY_OP(T, name, mod, %)
+
+#define PSNIP_SAFE_IS_LARGER(ORIG_MAX, DEST_MAX) ((DEST_MAX / ORIG_MAX) >= ORIG_MAX) /* true only when DEST_MAX >= ORIG_MAX * ORIG_MAX, i.e. DEST can hold the product of two ORIG maxima */
+
+#if defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) && defined(__SIZEOF_INT128__) && !defined(__ibmxl__) /* GCC-compatible >= 4.6 with 128-bit integers */
+#define PSNIP_SAFE_HAVE_128
+typedef __int128 psnip_safe_int128_t;
+typedef unsigned __int128 psnip_safe_uint128_t;
+#endif /* defined(__GNUC__) */
+
+#if !defined(PSNIP_SAFE_NO_FIXED) /* each fixed-width type is paired with the type of twice its width */
+#define PSNIP_SAFE_HAVE_INT8_LARGER
+#define PSNIP_SAFE_HAVE_UINT8_LARGER
+typedef psnip_int16_t psnip_safe_int8_larger;
+typedef psnip_uint16_t psnip_safe_uint8_larger;
+
+#define PSNIP_SAFE_HAVE_INT16_LARGER
+typedef psnip_int32_t psnip_safe_int16_larger;
+typedef psnip_uint32_t psnip_safe_uint16_larger;
+
+#define PSNIP_SAFE_HAVE_INT32_LARGER
+typedef psnip_int64_t psnip_safe_int32_larger;
+typedef psnip_uint64_t psnip_safe_uint32_larger;
+
+#if defined(PSNIP_SAFE_HAVE_128) /* 64-bit operands can only be widened when a 128-bit type exists */
+#define PSNIP_SAFE_HAVE_INT64_LARGER
+typedef psnip_safe_int128_t psnip_safe_int64_larger;
+typedef psnip_safe_uint128_t psnip_safe_uint64_larger;
+#endif /* defined(PSNIP_SAFE_HAVE_128) */
+#endif /* !defined(PSNIP_SAFE_NO_FIXED) */
+
+#define PSNIP_SAFE_HAVE_LARGER_SCHAR /* set optimistically; the final #else retracts it when no candidate fits */
+#if PSNIP_SAFE_IS_LARGER(SCHAR_MAX, SHRT_MAX)
+typedef short psnip_safe_schar_larger;
+#elif PSNIP_SAFE_IS_LARGER(SCHAR_MAX, INT_MAX)
+typedef int psnip_safe_schar_larger;
+#elif PSNIP_SAFE_IS_LARGER(SCHAR_MAX, LONG_MAX)
+typedef long psnip_safe_schar_larger;
+#elif PSNIP_SAFE_IS_LARGER(SCHAR_MAX, LLONG_MAX)
+typedef long long psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SCHAR_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SCHAR_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SCHAR_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_schar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (SCHAR_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_schar_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_SCHAR
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_UCHAR /* same cascade for unsigned char */
+#if PSNIP_SAFE_IS_LARGER(UCHAR_MAX, USHRT_MAX)
+typedef unsigned short psnip_safe_uchar_larger;
+#elif PSNIP_SAFE_IS_LARGER(UCHAR_MAX, UINT_MAX)
+typedef unsigned int psnip_safe_uchar_larger;
+#elif PSNIP_SAFE_IS_LARGER(UCHAR_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_uchar_larger;
+#elif PSNIP_SAFE_IS_LARGER(UCHAR_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UCHAR_MAX, 0xffffU)
+typedef psnip_uint16_t psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UCHAR_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UCHAR_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_uchar_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (UCHAR_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_uchar_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_UCHAR
+#endif
+
+#if CHAR_MIN == 0 && defined(PSNIP_SAFE_HAVE_LARGER_UCHAR) /* plain char reuses whichever signedness it has on this target */
+#define PSNIP_SAFE_HAVE_LARGER_CHAR
+typedef psnip_safe_uchar_larger psnip_safe_char_larger;
+#elif CHAR_MIN < 0 && defined(PSNIP_SAFE_HAVE_LARGER_SCHAR)
+#define PSNIP_SAFE_HAVE_LARGER_CHAR
+typedef psnip_safe_schar_larger psnip_safe_char_larger;
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_SHRT /* NOTE(review): spelled SHRT here, while the instantiation guard further down tests ..._SHORT */
+#if PSNIP_SAFE_IS_LARGER(SHRT_MAX, INT_MAX)
+typedef int psnip_safe_short_larger;
+#elif PSNIP_SAFE_IS_LARGER(SHRT_MAX, LONG_MAX)
+typedef long psnip_safe_short_larger;
+#elif PSNIP_SAFE_IS_LARGER(SHRT_MAX, LLONG_MAX)
+typedef long long psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SHRT_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SHRT_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(SHRT_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_short_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (SHRT_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_short_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_SHRT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_USHRT /* NOTE(review): spelled USHRT here, while the instantiation guard further down tests ..._USHORT */
+#if PSNIP_SAFE_IS_LARGER(USHRT_MAX, UINT_MAX)
+typedef unsigned int psnip_safe_ushort_larger;
+#elif PSNIP_SAFE_IS_LARGER(USHRT_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_ushort_larger;
+#elif PSNIP_SAFE_IS_LARGER(USHRT_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(USHRT_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(USHRT_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(USHRT_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_ushort_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (USHRT_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_ushort_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_USHRT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_INT /* set optimistically; the final #else retracts it when no candidate fits */
+#if PSNIP_SAFE_IS_LARGER(INT_MAX, LONG_MAX)
+typedef long psnip_safe_int_larger;
+#elif PSNIP_SAFE_IS_LARGER(INT_MAX, LLONG_MAX)
+typedef long long psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(INT_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(INT_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(INT_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_int_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (INT_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_int_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_INT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_UINT /* same cascade for unsigned int */
+#if PSNIP_SAFE_IS_LARGER(UINT_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_uint_larger;
+#elif PSNIP_SAFE_IS_LARGER(UINT_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UINT_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UINT_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(UINT_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_uint_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (UINT_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_uint_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_UINT
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_LONG /* set optimistically; the final #else retracts it when no candidate fits */
+#if PSNIP_SAFE_IS_LARGER(LONG_MAX, LLONG_MAX)
+typedef long long psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LONG_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LONG_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LONG_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_long_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (LONG_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_long_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_LONG
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_ULONG /* same cascade for unsigned long */
+#if PSNIP_SAFE_IS_LARGER(ULONG_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULONG_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULONG_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULONG_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_ulong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (ULONG_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_ulong_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_ULONG
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_LLONG /* no wider standard type to try, so only fixed-width and 128-bit candidates are tested */
+#if !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LLONG_MAX, 0x7fff)
+typedef psnip_int16_t psnip_safe_llong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LLONG_MAX, 0x7fffffffLL)
+typedef psnip_int32_t psnip_safe_llong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(LLONG_MAX, 0x7fffffffffffffffLL)
+typedef psnip_int64_t psnip_safe_llong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (LLONG_MAX <= 0x7fffffffffffffffLL)
+typedef psnip_safe_int128_t psnip_safe_llong_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_LLONG
+#endif
+
+#define PSNIP_SAFE_HAVE_LARGER_ULLONG /* same cascade for unsigned long long */
+#if !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULLONG_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_ullong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULLONG_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_ullong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(ULLONG_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_ullong_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (ULLONG_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_ullong_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_ULLONG
+#endif
+
+#if defined(PSNIP_SAFE_SIZE_MAX) /* the size_t cascade needs a preprocessor-visible maximum */
+#define PSNIP_SAFE_HAVE_LARGER_SIZE
+#if PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, USHRT_MAX)
+typedef unsigned short psnip_safe_size_larger;
+#elif PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, UINT_MAX)
+typedef unsigned int psnip_safe_size_larger;
+#elif PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, ULONG_MAX)
+typedef unsigned long psnip_safe_size_larger;
+#elif PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, ULLONG_MAX)
+typedef unsigned long long psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, 0xffff)
+typedef psnip_uint16_t psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, 0xffffffffUL)
+typedef psnip_uint32_t psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && PSNIP_SAFE_IS_LARGER(PSNIP_SAFE_SIZE_MAX, 0xffffffffffffffffULL)
+typedef psnip_uint64_t psnip_safe_size_larger;
+#elif !defined(PSNIP_SAFE_NO_FIXED) && defined(PSNIP_SAFE_HAVE_128) && (PSNIP_SAFE_SIZE_MAX <= 0xffffffffffffffffULL)
+typedef psnip_safe_uint128_t psnip_safe_size_larger;
+#else
+#undef PSNIP_SAFE_HAVE_LARGER_SIZE
+#endif
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_SCHAR) /* one instantiation per type whose wider typedef was found above */
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(signed char, schar)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_UCHAR)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned char, uchar)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_CHAR)
+#if CHAR_MIN == 0
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(char, char)
+#else
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(char, char)
+#endif
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_SHRT) /* must match the ..._SHRT name defined with psnip_safe_short_larger */
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(short, short)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_USHRT) /* must match the ..._USHRT name defined with psnip_safe_ushort_larger */
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned short, ushort)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_INT)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(int, int)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_UINT)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned int, uint)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_LONG)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(long, long)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_ULONG)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned long, ulong)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_LLONG)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(long long, llong)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_ULLONG)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(unsigned long long, ullong)
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_LARGER_SIZE)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(size_t, size)
+#endif
+
+#if !defined(PSNIP_SAFE_NO_FIXED)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int8_t, int8)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint8_t, uint8)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int16_t, int16)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint16_t, uint16)
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int32_t, int32)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint32_t, uint32)
+#if defined(PSNIP_SAFE_HAVE_128) /* the 64-bit helpers widen to the 128-bit typedefs */
+PSNIP_SAFE_DEFINE_LARGER_SIGNED_OPS(psnip_int64_t, int64)
+PSNIP_SAFE_DEFINE_LARGER_UNSIGNED_OPS(psnip_uint64_t, uint64)
+#endif
+#endif
+
+#endif /* !defined(PSNIP_SAFE_NO_PROMOTIONS) */
+
+#endif /* !defined(PSNIP_SAFE_NO_PROMOTIONS) */
+
+/* psnip_safe_<name>_<op_name>: defer to __builtin_<op_name>_overflow(a, b, res); the result is true when no overflow was reported. */
+#define PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(T, name, op_name) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_##op_name(T* res, T a, T b) { \
+    return __builtin_##op_name##_overflow(a, b, res) == 0; \
+  }
+
+/* psnip_safe_<name>_<op_name>: evaluate in psnip_safe_<name>_larger, store the value narrowed to T, and report whether it lay in [min, max]. */
+#define PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(T, name, op_name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_##op_name(T* res, T a, T b) { \
+    const psnip_safe_##name##_larger promoted = psnip_safe_larger_##name##_##op_name(a, b); \
+    *res = (T) promoted; /* written even when the range check below fails */ \
+    return !((promoted < min) || (promoted > max)); \
+  }
+
+/* psnip_safe_<name>_<op_name>: evaluate in psnip_safe_<name>_larger, store the value narrowed to T, and report whether it was <= max. */
+#define PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(T, name, op_name, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_##op_name(T* res, T a, T b) { \
+    const psnip_safe_##name##_larger promoted = psnip_safe_larger_##name##_##op_name(a, b); \
+    *res = (T) promoted; /* written even when the range check below fails */ \
+    return !(promoted > max); \
+  }
+
+/* psnip_safe_<name>_add: pre-check a + b against [min, max]; *res is written only when the sum fits, and the result is true in that case. */
+#define PSNIP_SAFE_DEFINE_SIGNED_ADD(T, name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_add (T* res, T a, T b) { \
+    /* b > 0 can only exceed max, b < 0 can only drop below min; the bounds are parenthesized so any expression is a safe argument */ \
+    psnip_safe_bool r = !( ((b > 0) && (a > ((max) - b))) || \
+                           ((b < 0) && (a < ((min) - b))) ); \
+    if (PSNIP_SAFE_LIKELY(r)) *res = (T) (a + b); \
+    return r; \
+  }
+
+/* psnip_safe_<name>_add: always stores the sum converted to T; the result is false when a + b would exceed max. */
+#define PSNIP_SAFE_DEFINE_UNSIGNED_ADD(T, name, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_add (T* res, T a, T b) { \
+    *res = (T) (a + b); \
+    return !PSNIP_SAFE_UNLIKELY((b > 0) && (a > ((max) - b))); /* (max) parenthesized for macro hygiene */ \
+  }
+
+/* psnip_safe_<name>_sub: pre-check a - b against [min, max]; *res is written only when the difference fits, and the result is true in that case. */
+#define PSNIP_SAFE_DEFINE_SIGNED_SUB(T, name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_sub (T* res, T a, T b) { \
+    /* b > 0 can only drop below min, b < 0 can only exceed max; the bounds are parenthesized so any expression is a safe argument */ \
+    psnip_safe_bool r = !( ((b > 0) && (a < ((min) + b))) || \
+                           ((b < 0) && (a > ((max) + b))) ); \
+    if (PSNIP_SAFE_LIKELY(r)) *res = (T) (a - b); \
+    return r; \
+  }
+
+/* psnip_safe_<name>_sub: always stores the difference converted to T; the result is false when b > a. The max argument is unused. */
+#define PSNIP_SAFE_DEFINE_UNSIGNED_SUB(T, name, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_sub (T* res, T a, T b) { \
+    *res = (T) (a - b); /* explicit cast, matching the unsigned add/mul generators (a - b is an int for narrow T) */ \
+    return !PSNIP_SAFE_UNLIKELY(b > a); \
+  }
+
+/* psnip_safe_<name>_mul: pre-check a * b against [min, max] using division,
+ * with one test per sign combination of the operands. *res is written only
+ * when the product fits; the result is true in exactly that case. */
+#define PSNIP_SAFE_DEFINE_SIGNED_MUL(T, name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_mul (T* res, T a, T b) { \
+    psnip_safe_bool r; \
+    if (a > 0) { \
+      if (b > 0) { \
+        /* positive * positive: too large when a > max / b */ \
+        r = !(a > ((max) / b)); \
+      } else { \
+        /* positive * non-positive: too small when b < min / a */ \
+        r = !(b < ((min) / a)); \
+      } \
+    } else { \
+      if (b > 0) { \
+        /* non-positive * positive: too small when a < min / b */ \
+        r = !(a < ((min) / b)); \
+      } else { \
+        /* non-positive * non-positive: too large when b < max / a; */ \
+        /* a == 0 is tested first so the division is never by zero */ \
+        r = !((a != 0) && (b < ((max) / a))); \
+      } \
+    } \
+    /* multiply only after the check, so an overflowing signed product is never evaluated */ \
+    if (PSNIP_SAFE_LIKELY(r)) \
+      *res = (T) (a * b); \
+    return r; \
+  }
+
+/* psnip_safe_<name>_mul: always stores the product reduced to T; the result is false when a * b would exceed max. */
+#define PSNIP_SAFE_DEFINE_UNSIGNED_MUL(T, name, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_mul (T* res, T a, T b) { \
+    *res = (T) (1U * a * b); /* 1U forces unsigned arithmetic: a T narrower than int would otherwise multiply as signed int and could overflow */ \
+    return !PSNIP_SAFE_UNLIKELY((a > 0) && (b > 0) && (a > ((max) / b))); \
+  }
+
+/* psnip_safe_<name>_div: signed division. Fails (returns 0) for b == 0 with *res = 0, and for min / -1 with *res = min; otherwise stores a / b and returns 1. */
+#define PSNIP_SAFE_DEFINE_SIGNED_DIV(T, name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_div (T* res, T a, T b) { \
+    if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+      *res = 0; /* division by zero */ \
+      return 0; \
+    } \
+    if (PSNIP_SAFE_UNLIKELY(a == min && b == -1)) { \
+      *res = min; /* the true quotient would be max + 1 */ \
+      return 0; \
+    } \
+    *res = (T) (a / b); \
+    return 1; \
+  }
+
+/* psnip_safe_<name>_div: unsigned division. Fails (returns 0, *res = 0) only when b == 0; the max argument is unused. */
+#define PSNIP_SAFE_DEFINE_UNSIGNED_DIV(T, name, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_div (T* res, T a, T b) { \
+    if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+      *res = 0; \
+      return 0; \
+    } \
+    /* with a non-zero divisor the quotient is stored as-is */ \
+    *res = a / b; \
+    return 1; \
+  }
+
+/* psnip_safe_<name>_mod: signed remainder. Fails (returns 0) for b == 0 with *res = 0, and for min % -1 with *res = min; otherwise stores a % b and returns 1. */
+#define PSNIP_SAFE_DEFINE_SIGNED_MOD(T, name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_mod (T* res, T a, T b) { \
+    if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+      *res = 0; /* remainder by zero */ \
+      return 0; \
+    } \
+    if (PSNIP_SAFE_UNLIKELY(a == min && b == -1)) { \
+      *res = min; /* rejected like the matching division case */ \
+      return 0; \
+    } \
+    *res = (T) (a % b); \
+    return 1; \
+  }
+
+/* psnip_safe_<name>_mod: unsigned remainder. Fails (returns 0, *res = 0) only when b == 0; the max argument is unused. */
+#define PSNIP_SAFE_DEFINE_UNSIGNED_MOD(T, name, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_mod (T* res, T a, T b) { \
+    if (PSNIP_SAFE_UNLIKELY(b == 0)) { \
+      *res = 0; \
+      return 0; \
+    } \
+    /* with a non-zero divisor the remainder is stored as-is */ \
+    *res = a % b; \
+    return 1; \
+  }
+
+/* psnip_safe_<name>_neg: stores -value and returns true, except for value == min, where it stores max and returns false. */
+#define PSNIP_SAFE_DEFINE_SIGNED_NEG(T, name, min, max) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_neg (T* res, T value) { \
+    const psnip_safe_bool negatable = !(value == min); \
+    *res = PSNIP_SAFE_LIKELY(negatable) ? -value : max; \
+    return negatable; \
+  }
+
+/* psnip_safe_<name>_<op>: forward to isf(a, b, res) and report success when it returns S_OK. */
+#define PSNIP_SAFE_DEFINE_INTSAFE(T, name, op, isf) \
+  PSNIP_SAFE__FUNCTION psnip_safe_bool psnip_safe_##name##_##op (T* res, T a, T b) { \
+    return S_OK == isf(a, b, res); \
+  }
+
+#if CHAR_MIN == 0 /* plain char is unsigned here: unsigned generators are used and no psnip_safe_char_neg is defined */
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* add/sub/mul: first choice is the compiler overflow builtins */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_CHAR) /* otherwise compute in psnip_safe_char_larger and range-check */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(char, char, add, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(char, char, sub, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(char, char, mul, CHAR_MAX)
+#else /* otherwise the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(char, char, CHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(char, char, CHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(char, char, CHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(char, char, CHAR_MAX) /* div and mod have a single implementation regardless of backend */
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(char, char, CHAR_MAX)
+#else /* CHAR_MIN != 0 */
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* same backend order as above, using the signed generators */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(char, char, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_CHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(char, char, add, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(char, char, sub, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(char, char, mul, CHAR_MIN, CHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(char, char, CHAR_MIN, CHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MOD(char, char, CHAR_MIN, CHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(char, char, CHAR_MIN, CHAR_MAX) /* neg exists only in the signed configuration */
+#endif
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* signed char add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(signed char, schar, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(signed char, schar, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(signed char, schar, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_SCHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(signed char, schar, add, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(signed char, schar, sub, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(signed char, schar, mul, SCHAR_MIN, SCHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(signed char, schar, SCHAR_MIN, SCHAR_MAX) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(signed char, schar, SCHAR_MIN, SCHAR_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* unsigned char add/sub/mul: same backend order */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned char, uchar, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned char, uchar, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned char, uchar, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UCHAR)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned char, uchar, add, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned char, uchar, sub, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned char, uchar, mul, UCHAR_MAX)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned char, uchar, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned char, uchar, UCHAR_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned char, uchar, UCHAR_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned char, uchar, UCHAR_MAX) /* unsigned types get div and mod but no neg */
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned char, uchar, UCHAR_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* short add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(short, short, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(short, short, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(short, short, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_SHORT)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(short, short, add, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(short, short, sub, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(short, short, mul, SHRT_MIN, SHRT_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(short, short, SHRT_MIN, SHRT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(short, short, SHRT_MIN, SHRT_MAX) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(short, short, SHRT_MIN, SHRT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(short, short, SHRT_MIN, SHRT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* unsigned short add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned short, ushort, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned short, ushort, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned short, ushort, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) /* then the UShort* helpers, whose success value is S_OK */
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned short, ushort, add, UShortAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned short, ushort, sub, UShortSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned short, ushort, mul, UShortMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_USHORT) /* then the wider psnip_safe_ushort_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned short, ushort, add, USHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned short, ushort, sub, USHRT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned short, ushort, mul, USHRT_MAX)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned short, ushort, USHRT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned short, ushort, USHRT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned short, ushort, USHRT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned short, ushort, USHRT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned short, ushort, USHRT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* int add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(int, int, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(int, int, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(int, int, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(int, int, add, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(int, int, sub, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(int, int, mul, INT_MIN, INT_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(int, int, INT_MIN, INT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(int, int, INT_MIN, INT_MAX) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(int, int, INT_MIN, INT_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(int, int, INT_MIN, INT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* unsigned int add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned int, uint, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned int, uint, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned int, uint, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) /* then the UInt* helpers, whose success value is S_OK */
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned int, uint, add, UIntAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned int, uint, sub, UIntSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned int, uint, mul, UIntMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT) /* then the wider psnip_safe_uint_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned int, uint, add, UINT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned int, uint, sub, UINT_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned int, uint, mul, UINT_MAX)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned int, uint, UINT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned int, uint, UINT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned int, uint, UINT_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned int, uint, UINT_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned int, uint, UINT_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* long add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long, long, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long, long, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long, long, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_LONG)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long, long, add, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long, long, sub, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long, long, mul, LONG_MIN, LONG_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(long, long, LONG_MIN, LONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(long, long, LONG_MIN, LONG_MAX) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(long, long, LONG_MIN, LONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(long, long, LONG_MIN, LONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* unsigned long add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long, ulong, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long, ulong, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long, ulong, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) /* then the ULong* helpers, whose success value is S_OK */
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long, ulong, add, ULongAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long, ulong, sub, ULongSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long, ulong, mul, ULongMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_ULONG) /* then the wider psnip_safe_ulong_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long, ulong, add, ULONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long, ulong, sub, ULONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long, ulong, mul, ULONG_MAX)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned long, ulong, ULONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned long, ulong, ULONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned long, ulong, ULONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned long, ulong, ULONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned long, ulong, ULONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* long long add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long long, llong, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long long, llong, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(long long, llong, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_LLONG)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long long, llong, add, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long long, llong, sub, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(long long, llong, mul, LLONG_MIN, LLONG_MAX)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(long long, llong, LLONG_MIN, LLONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(long long, llong, LLONG_MIN, LLONG_MAX) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(long long, llong, LLONG_MIN, LLONG_MAX)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(long long, llong, LLONG_MIN, LLONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* unsigned long long add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long long, ullong, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long long, ullong, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(unsigned long long, ullong, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) /* then the ULongLong* helpers, whose success value is S_OK */
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long long, ullong, add, ULongLongAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long long, ullong, sub, ULongLongSub)
+PSNIP_SAFE_DEFINE_INTSAFE(unsigned long long, ullong, mul, ULongLongMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_ULLONG) /* then the wider psnip_safe_ullong_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long long, ullong, add, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long long, ullong, sub, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(unsigned long long, ullong, mul, ULLONG_MAX)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(unsigned long long, ullong, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(unsigned long long, ullong, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(unsigned long long, ullong, ULLONG_MAX)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(unsigned long long, ullong, ULLONG_MAX)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(unsigned long long, ullong, ULLONG_MAX)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* size_t add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(size_t, size, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(size_t, size, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(size_t, size, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) /* then the SizeT* helpers, whose success value is S_OK */
+PSNIP_SAFE_DEFINE_INTSAFE(size_t, size, add, SizeTAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(size_t, size, sub, SizeTSub)
+PSNIP_SAFE_DEFINE_INTSAFE(size_t, size, mul, SizeTMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_SIZE) /* then the wider psnip_safe_size_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(size_t, size, add, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(size_t, size, sub, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(size_t, size, mul, PSNIP_SAFE__SIZE_MAX_RT)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(size_t, size, PSNIP_SAFE__SIZE_MAX_RT) /* NOTE(review): PSNIP_SAFE__SIZE_MAX_RT is defined outside this section; its name suggests a run-time SIZE_MAX - confirm */
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(size_t, size, PSNIP_SAFE__SIZE_MAX_RT)
+
+#if !defined(PSNIP_SAFE_NO_FIXED)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* int8 add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int8_t, int8, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int8_t, int8, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int8_t, int8, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT8)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int8_t, int8, add, (-0x7fLL-1), 0x7f) /* bounds are spelled as literals: -128 and 127 */
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int8_t, int8, sub, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int8_t, int8, mul, (-0x7fLL-1), 0x7f)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int8_t, int8, (-0x7fLL-1), 0x7f) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int8_t, int8, (-0x7fLL-1), 0x7f)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* uint8 add/sub/mul: same backend order, upper bound 0xff */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint8_t, uint8, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint8_t, uint8, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint8_t, uint8, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT8)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint8_t, uint8, add, 0xff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint8_t, uint8, sub, 0xff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint8_t, uint8, mul, 0xff)
+#else
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint8_t, uint8, 0xff)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint8_t, uint8, 0xff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint8_t, uint8, 0xff)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint8_t, uint8, 0xff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint8_t, uint8, 0xff)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* int16 add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int16_t, int16, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int16_t, int16, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int16_t, int16, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT16)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int16_t, int16, add, (-32767-1), 0x7fff) /* bounds are spelled as literals: -32768 and 32767 */
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int16_t, int16, sub, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int16_t, int16, mul, (-32767-1), 0x7fff)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int16_t, int16, (-32767-1), 0x7fff)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int16_t, int16, (-32767-1), 0x7fff) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int16_t, int16, (-32767-1), 0x7fff)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int16_t, int16, (-32767-1), 0x7fff)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* uint16 add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint16_t, uint16, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint16_t, uint16, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint16_t, uint16, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) && defined(_WIN32) /* UShort* helpers are used for the fixed-width type only when _WIN32 is also defined */
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint16_t, uint16, add, UShortAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint16_t, uint16, sub, UShortSub)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint16_t, uint16, mul, UShortMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT16) /* then the wider psnip_safe_uint16_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint16_t, uint16, add, 0xffff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint16_t, uint16, sub, 0xffff)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint16_t, uint16, mul, 0xffff)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint16_t, uint16, 0xffff)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint16_t, uint16, 0xffff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint16_t, uint16, 0xffff)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint16_t, uint16, 0xffff)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint16_t, uint16, 0xffff)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* int32 add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int32_t, int32, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int32_t, int32, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int32_t, int32, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT32)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int32_t, int32, add, (-0x7fffffffLL-1), 0x7fffffffLL) /* bounds are long long literals: -2^31 and 2^31 - 1 */
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int32_t, int32, sub, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int32_t, int32, mul, (-0x7fffffffLL-1), 0x7fffffffLL)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int32_t, int32, (-0x7fffffffLL-1), 0x7fffffffLL)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* uint32 add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint32_t, uint32, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint32_t, uint32, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint32_t, uint32, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) && defined(_WIN32) /* UInt* helpers are used for the fixed-width type only when _WIN32 is also defined */
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint32_t, uint32, add, UIntAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint32_t, uint32, sub, UIntSub)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint32_t, uint32, mul, UIntMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT32) /* then the wider psnip_safe_uint32_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint32_t, uint32, add, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint32_t, uint32, sub, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint32_t, uint32, mul, 0xffffffffUL)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint32_t, uint32, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint32_t, uint32, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint32_t, uint32, 0xffffffffUL)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint32_t, uint32, 0xffffffffUL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint32_t, uint32, 0xffffffffUL)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* int64 add/sub/mul: builtins, else wider type, else portable checks */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int64_t, int64, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int64_t, int64, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_int64_t, int64, mul)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_INT64)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int64_t, int64, add, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL) /* bounds are long long literals: -2^63 and 2^63 - 1 */
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int64_t, int64, sub, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_PROMOTED_SIGNED_BINARY_OP(psnip_int64_t, int64, mul, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+#else
+PSNIP_SAFE_DEFINE_SIGNED_ADD(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_SUB(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_MUL(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+#endif
+PSNIP_SAFE_DEFINE_SIGNED_DIV(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL) /* div, mod and neg are defined unconditionally */
+PSNIP_SAFE_DEFINE_SIGNED_MOD(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+PSNIP_SAFE_DEFINE_SIGNED_NEG(psnip_int64_t, int64, (-0x7fffffffffffffffLL-1), 0x7fffffffffffffffLL)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* uint64 add/sub/mul: builtins first */
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint64_t, uint64, add)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint64_t, uint64, sub)
+PSNIP_SAFE_DEFINE_BUILTIN_BINARY_OP(psnip_uint64_t, uint64, mul)
+#elif defined(PSNIP_SAFE_HAVE_INTSAFE_H) && defined(_WIN32) /* ULongLong* helpers are used for the fixed-width type only when _WIN32 is also defined */
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint64_t, uint64, add, ULongLongAdd)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint64_t, uint64, sub, ULongLongSub)
+PSNIP_SAFE_DEFINE_INTSAFE(psnip_uint64_t, uint64, mul, ULongLongMult)
+#elif defined(PSNIP_SAFE_HAVE_LARGER_UINT64) /* then the wider psnip_safe_uint64_larger type */
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint64_t, uint64, add, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint64_t, uint64, sub, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_PROMOTED_UNSIGNED_BINARY_OP(psnip_uint64_t, uint64, mul, 0xffffffffffffffffULL)
+#else /* finally the portable pre-checked arithmetic */
+PSNIP_SAFE_DEFINE_UNSIGNED_ADD(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_UNSIGNED_SUB(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MUL(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+#endif
+PSNIP_SAFE_DEFINE_UNSIGNED_DIV(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+PSNIP_SAFE_DEFINE_UNSIGNED_MOD(psnip_uint64_t, uint64, 0xffffffffffffffffULL)
+
+#endif /* !defined(PSNIP_SAFE_NO_FIXED) */
+
+#define PSNIP_SAFE_C11_GENERIC_SELECTION(res, op) /* names psnip_safe_<type>_<op> for the type that res points to */ \
+  _Generic((*(res)), char: psnip_safe_char_##op, \
+    signed char: psnip_safe_schar_##op, /* a distinct type from plain char, so it needs its own association */ \
+    unsigned char: psnip_safe_uchar_##op, \
+    short: psnip_safe_short_##op, \
+    unsigned short: psnip_safe_ushort_##op, \
+    int: psnip_safe_int_##op, \
+    unsigned int: psnip_safe_uint_##op, \
+    long: psnip_safe_long_##op, \
+    unsigned long: psnip_safe_ulong_##op, \
+    long long: psnip_safe_llong_##op, \
+    unsigned long long: psnip_safe_ullong_##op)
+
+#define PSNIP_SAFE_C11_GENERIC_BINARY_OP(op, res, a, b) /* select the typed function from *res, then call it with (res, a, b) */ \
+ PSNIP_SAFE_C11_GENERIC_SELECTION(res, op)(res, a, b)
+#define PSNIP_SAFE_C11_GENERIC_UNARY_OP(op, res, v) /* select the typed function from *res, then call it with (res, v) */ \
+ PSNIP_SAFE_C11_GENERIC_SELECTION(res, op)(res, v)
+
+#if defined(PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW) /* type-generic front ends; each expands to an expression that is true on success */
+#define psnip_safe_add(res, a, b) (!__builtin_add_overflow(a, b, res))
+#define psnip_safe_sub(res, a, b) (!__builtin_sub_overflow(a, b, res))
+#define psnip_safe_mul(res, a, b) (!__builtin_mul_overflow(a, b, res))
+#define psnip_safe_div(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(div, res, a, b) /* no __builtin_div_overflow exists; use the typed functions */
+#define psnip_safe_mod(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(mod, res, a, b) /* no __builtin_mod_overflow exists; use the typed functions */
+#define psnip_safe_neg(res, v) PSNIP_SAFE_C11_GENERIC_UNARY_OP (neg, res, v)
+
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+/* There are no fixed-length or size selections because they cause an
+ * error about _Generic specifying two compatible types. Hopefully
+ * this doesn't cause problems on exotic platforms, but if it does
+ * please let me know and I'll try to figure something out. */
+
+#define psnip_safe_add(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(add, res, a, b)
+#define psnip_safe_sub(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(sub, res, a, b)
+#define psnip_safe_mul(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(mul, res, a, b)
+#define psnip_safe_div(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(div, res, a, b)
+#define psnip_safe_mod(res, a, b) PSNIP_SAFE_C11_GENERIC_BINARY_OP(mod, res, a, b)
+#define psnip_safe_neg(res, v) PSNIP_SAFE_C11_GENERIC_UNARY_OP (neg, res, v) /* NOTE(review): only signed psnip_safe_*_neg functions are defined in this section; confirm this selection can expand */
+#endif
+
+#if !defined(PSNIP_SAFE_HAVE_BUILTINS) && (defined(PSNIP_SAFE_EMULATE_NATIVE) || defined(PSNIP_BUILTIN_EMULATE_NATIVE)) /* NOTE(review): the rest of this section tests PSNIP_SAFE_HAVE_BUILTIN_OVERFLOW, not PSNIP_SAFE_HAVE_BUILTINS - confirm the guard name is intended */
+# define __builtin_sadd_overflow(a, b, res) (!psnip_safe_int_add(res, a, b)) /* opt-in stand-ins for the typed builtins: arguments are reordered and the result negated */
+# define __builtin_saddl_overflow(a, b, res) (!psnip_safe_long_add(res, a, b))
+# define __builtin_saddll_overflow(a, b, res) (!psnip_safe_llong_add(res, a, b))
+# define __builtin_uadd_overflow(a, b, res) (!psnip_safe_uint_add(res, a, b))
+# define __builtin_uaddl_overflow(a, b, res) (!psnip_safe_ulong_add(res, a, b))
+# define __builtin_uaddll_overflow(a, b, res) (!psnip_safe_ullong_add(res, a, b))
+
+# define __builtin_ssub_overflow(a, b, res) (!psnip_safe_int_sub(res, a, b)) /* subtraction stand-ins, same suffix scheme (none / l / ll) */
+# define __builtin_ssubl_overflow(a, b, res) (!psnip_safe_long_sub(res, a, b))
+# define __builtin_ssubll_overflow(a, b, res) (!psnip_safe_llong_sub(res, a, b))
+# define __builtin_usub_overflow(a, b, res) (!psnip_safe_uint_sub(res, a, b))
+# define __builtin_usubl_overflow(a, b, res) (!psnip_safe_ulong_sub(res, a, b))
+# define __builtin_usubll_overflow(a, b, res) (!psnip_safe_ullong_sub(res, a, b))
+
+# define __builtin_smul_overflow(a, b, res) (!psnip_safe_int_mul(res, a, b)) /* multiplication stand-ins */
+# define __builtin_smull_overflow(a, b, res) (!psnip_safe_long_mul(res, a, b))
+# define __builtin_smulll_overflow(a, b, res) (!psnip_safe_llong_mul(res, a, b))
+# define __builtin_umul_overflow(a, b, res) (!psnip_safe_uint_mul(res, a, b))
+# define __builtin_umull_overflow(a, b, res) (!psnip_safe_ulong_mul(res, a, b))
+# define __builtin_umulll_overflow(a, b, res) (!psnip_safe_ullong_mul(res, a, b))
+#endif
+
+#endif /* !defined(PSNIP_SAFE_H) */
diff --git a/go/arrow/compute/internal/kernels/base_arithmetic.go b/go/arrow/compute/internal/kernels/base_arithmetic.go
new file mode 100644
index 00000000000..67994bd65e8
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/base_arithmetic.go
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernels
+
+import (
+ "fmt"
+
+ "github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/internal/debug"
+ "golang.org/x/exp/constraints"
+)
+// ArithmeticOp selects the binary arithmetic operation performed by a kernel built with ArithmeticExec.
+type ArithmeticOp int8
+// Order matters: the floating-point Go path maps a checked op onto its unchecked twin by subtracting OpAddChecked.
+const (
+ OpAdd ArithmeticOp = iota // a + b with no overflow detection
+ OpSub // a - b with no overflow detection
+ // Checked variants: the pure Go integer kernels store errOverflow through the error pointer on overflow.
+ OpAddChecked
+ OpSubChecked
+)
+
+// getGoArithmeticBinary lifts the element-wise function op into the three loop
+// shapes of binaryOps: array/array, array/scalar and scalar/array. Each loop fills
+// every index of out and returns whatever op left behind in the shared error slot.
+func getGoArithmeticBinary[T exec.NumericTypes](op func(a, b T, e *error) T) binaryOps[T, T, T] {
+	var ops binaryOps[T, T, T]
+	ops.arrArr = func(_ *exec.KernelCtx, left, right, out []T) (err error) {
+		for idx := 0; idx < len(out); idx++ {
+			out[idx] = op(left[idx], right[idx], &err)
+		}
+		return
+	}
+	ops.arrScalar = func(_ *exec.KernelCtx, left []T, right T, out []T) (err error) {
+		for idx := 0; idx < len(out); idx++ {
+			out[idx] = op(left[idx], right, &err)
+		}
+		return
+	}
+	ops.scalarArr = func(_ *exec.KernelCtx, left T, right, out []T) (err error) {
+		for idx := 0; idx < len(out); idx++ {
+			out[idx] = op(left, right[idx], &err)
+		}
+		return
+	}
+	return ops
+}
+
+// errOverflow is stored by the checked integer kernels when the exact result does not fit in T.
+var errOverflow = fmt.Errorf("%w: overflow", arrow.ErrInvalid)
+
+// getGoArithmeticBinaryOpIntegral returns the pure Go kernel for op on integer type T. OpAdd and OpSub
+// wrap silently; the checked ops return the same wrapped value but also store errOverflow in *e.
+func getGoArithmeticBinaryOpIntegral[T exec.UintTypes | exec.IntTypes](op ArithmeticOp) exec.ArrayKernelExec {
+	signed := ^T(0) < 0 // ^T(0) is -1 for signed T and the maximum value for unsigned T
+	switch op {
+	case OpAdd:
+		return ScalarBinary(getGoArithmeticBinary(func(a, b T, _ *error) T { return a + b }))
+	case OpSub:
+		return ScalarBinary(getGoArithmeticBinary(func(a, b T, _ *error) T { return a - b }))
+	case OpAddChecked:
+		return ScalarBinaryNotNull(func(_ *exec.KernelCtx, a, b T, e *error) (out T) {
+			out = a + b
+			overflow := out < a // unsigned rule: the sum wrapped around past the maximum
+			if signed {
+				// signed rule: both operands share a sign that the sum lacks (covers MinInt + -1 too)
+				overflow = (a^out)&(b^out) < 0
+			}
+			if overflow {
+				*e = errOverflow
+			}
+			return
+		})
+	case OpSubChecked:
+		return ScalarBinaryNotNull(func(_ *exec.KernelCtx, a, b T, e *error) (out T) {
+			out = a - b
+			overflow := a < b // unsigned rule: a borrow past zero is needed
+			if signed {
+				// signed rule: operands differ in sign and the result lost a's sign (covers MaxInt - -1 too)
+				overflow = (a^b)&(a^out) < 0
+			}
+			if overflow {
+				*e = errOverflow
+			}
+			return
+		})
+	}
+	debug.Assert(false, "invalid arithmetic op")
+	return nil
+}
+
+// getGoArithmeticBinaryOpFloating returns the pure Go kernel for op on float type T.
+func getGoArithmeticBinaryOpFloating[T constraints.Float](op ArithmeticOp) exec.ArrayKernelExec {
+	add := func(a, b T, _ *error) T { return a + b }
+	sub := func(a, b T, _ *error) T { return a - b }
+	switch op {
+	case OpAdd, OpAddChecked: // floating checked is the same as floating unchecked
+		return ScalarBinary(getGoArithmeticBinary(add))
+	case OpSub, OpSubChecked:
+		return ScalarBinary(getGoArithmeticBinary(sub))
+	}
+	debug.Assert(false, "invalid arithmetic op")
+	return nil
+}
+
+// ArithmeticExec returns the kernel that applies op to inputs whose element type
+// is ty. The eight fixed-width integer types and FLOAT32/FLOAT64 are supported;
+// for any other type it calls debug.Assert(false, ...) and returns nil.
+//
+// The per-type builders are the getArithmeticBinaryOp* functions, which on amd64
+// choose between the AVX2, SSE4 and pure Go implementations.
+func ArithmeticExec(ty arrow.Type, op ArithmeticOp) exec.ArrayKernelExec {
+	// Dispatch table from element type to the generic builder instantiated for the
+	// matching Go type.
+	builders := map[arrow.Type]func(ArithmeticOp) exec.ArrayKernelExec{
+		arrow.INT8:    getArithmeticBinaryOpIntegral[int8],
+		arrow.UINT8:   getArithmeticBinaryOpIntegral[uint8],
+		arrow.INT16:   getArithmeticBinaryOpIntegral[int16],
+		arrow.UINT16:  getArithmeticBinaryOpIntegral[uint16],
+		arrow.INT32:   getArithmeticBinaryOpIntegral[int32],
+		arrow.UINT32:  getArithmeticBinaryOpIntegral[uint32],
+		arrow.INT64:   getArithmeticBinaryOpIntegral[int64],
+		arrow.UINT64:  getArithmeticBinaryOpIntegral[uint64],
+		arrow.FLOAT32: getArithmeticBinaryOpFloating[float32],
+		arrow.FLOAT64: getArithmeticBinaryOpFloating[float64],
+	}
+	if build, ok := builders[ty]; ok {
+		return build(op)
+	}
+	debug.Assert(false, "invalid arithmetic type")
+	return nil
+}
diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go b/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go
new file mode 100644
index 00000000000..9ce0d60c9a9
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/base_arithmetic_amd64.go
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package kernels
+
+import (
+ "unsafe"
+
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "golang.org/x/exp/constraints"
+ "golang.org/x/sys/cpu"
+)
+
+// getAvx2ArithmeticBinaryNumeric returns binaryOps whose three loops hand op to the AVX2 kernels.
+func getAvx2ArithmeticBinaryNumeric[T exec.NumericTypes](op ArithmeticOp) binaryOps[T, T, T] {
+	ops, typ := binaryOps[T, T, T]{}, exec.GetType[T]()
+	ops.arrArr = func(_ *exec.KernelCtx, lhs, rhs, dst []T) error {
+		arithmeticAvx2(typ, op, exec.GetBytes(lhs), exec.GetBytes(rhs), exec.GetBytes(dst), len(dst))
+		return nil // the kernel call has no error result to propagate
+	}
+	ops.arrScalar = func(_ *exec.KernelCtx, lhs []T, rhs T, dst []T) error {
+		arithmeticArrScalarAvx2(typ, op, exec.GetBytes(lhs), unsafe.Pointer(&rhs), exec.GetBytes(dst), len(dst))
+		return nil
+	}
+	ops.scalarArr = func(_ *exec.KernelCtx, lhs T, rhs, dst []T) error {
+		arithmeticScalarArrAvx2(typ, op, unsafe.Pointer(&lhs), exec.GetBytes(rhs), exec.GetBytes(dst), len(dst))
+		return nil
+	}
+	return ops
+}
+
+// getSSE4ArithmeticBinaryNumeric returns binaryOps whose three loops hand op to the SSE4 kernels.
+func getSSE4ArithmeticBinaryNumeric[T exec.NumericTypes](op ArithmeticOp) binaryOps[T, T, T] {
+	ops, typ := binaryOps[T, T, T]{}, exec.GetType[T]()
+	ops.arrArr = func(_ *exec.KernelCtx, lhs, rhs, dst []T) error {
+		arithmeticSSE4(typ, op, exec.GetBytes(lhs), exec.GetBytes(rhs), exec.GetBytes(dst), len(dst))
+		return nil // the kernel call has no error result to propagate
+	}
+	ops.arrScalar = func(_ *exec.KernelCtx, lhs []T, rhs T, dst []T) error {
+		arithmeticArrScalarSSE4(typ, op, exec.GetBytes(lhs), unsafe.Pointer(&rhs), exec.GetBytes(dst), len(dst))
+		return nil
+	}
+	ops.scalarArr = func(_ *exec.KernelCtx, lhs T, rhs, dst []T) error {
+		arithmeticScalarArrSSE4(typ, op, unsafe.Pointer(&lhs), exec.GetBytes(rhs), exec.GetBytes(dst), len(dst))
+		return nil
+	}
+	return ops
+}
+
+// getArithmeticBinaryOpIntegral picks the integer kernel for op on amd64: checked ops
+// always use the pure Go kernel, unchecked ops use AVX2 or SSE4.2 when the CPU reports
+// the feature and otherwise fall back to the same pure Go implementation.
+func getArithmeticBinaryOpIntegral[T exec.UintTypes | exec.IntTypes](op ArithmeticOp) exec.ArrayKernelExec {
+	switch {
+	case op >= OpAddChecked: // integral checked funcs need to use ScalarBinaryNotNull
+		return getGoArithmeticBinaryOpIntegral[T](op)
+	case cpu.X86.HasAVX2:
+		return ScalarBinary(getAvx2ArithmeticBinaryNumeric[T](op))
+	case cpu.X86.HasSSE42:
+		return ScalarBinary(getSSE4ArithmeticBinaryNumeric[T](op))
+	}
+	return getGoArithmeticBinaryOpIntegral[T](op)
+}
+// getArithmeticBinaryOpFloating picks the float kernel for op: AVX2, else SSE4.2, else pure Go; op reaches the SIMD kernels unmodified.
+func getArithmeticBinaryOpFloating[T constraints.Float](op ArithmeticOp) exec.ArrayKernelExec {
+	switch {
+	case cpu.X86.HasAVX2:
+		return ScalarBinary(getAvx2ArithmeticBinaryNumeric[T](op))
+	case cpu.X86.HasSSE42:
+		return ScalarBinary(getSSE4ArithmeticBinaryNumeric[T](op))
+	}
+	return getGoArithmeticBinaryOpFloating[T](op)
+}
diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_avx2_amd64.go b/go/arrow/compute/internal/kernels/base_arithmetic_avx2_amd64.go
new file mode 100644
index 00000000000..35e72f1cc83
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/base_arithmetic_avx2_amd64.go
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package kernels
+
+import (
+ "unsafe"
+
+ "github.com/apache/arrow/go/v10/arrow"
+)
+
+//go:noescape
+func _arithmetic_avx2(typ int, op int8, inLeft, inRight, out unsafe.Pointer, len int) // body: TEXT ·_arithmetic_avx2 in base_arithmetic_avx2_amd64.s
+// arithmeticAvx2 applies op element-wise to left and right, writing len elements of type typ to out via the AVX2 assembly; it panics if any slice is empty because it takes &s[0].
+func arithmeticAvx2(typ arrow.Type, op ArithmeticOp, left, right, out []byte, len int) {
+ _arithmetic_avx2(int(typ), int8(op), unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), len) // NOTE(review): the visible assembly tests and copies only the low 32 bits of len (r9d) -- confirm callers stay below that
+}
+
+//go:noescape
+func _arithmetic_arr_scalar_avx2(typ int, op int8, inLeft, inRight, out unsafe.Pointer, len int) // no Go body; presumably defined in base_arithmetic_avx2_amd64.s (not visible here)
+// arithmeticArrScalarAvx2 forwards an array left operand and a pointer to a single right operand to the AVX2 assembly; it panics if left or out is empty because it takes &s[0].
+func arithmeticArrScalarAvx2(typ arrow.Type, op ArithmeticOp, left []byte, right unsafe.Pointer, out []byte, len int) {
+ _arithmetic_arr_scalar_avx2(int(typ), int8(op), unsafe.Pointer(&left[0]), right, unsafe.Pointer(&out[0]), len) // right is passed through unchanged
+}
+
+//go:noescape
+func _arithmetic_scalar_arr_avx2(typ int, op int8, inLeft, inRight, out unsafe.Pointer, len int) // no Go body; presumably defined in base_arithmetic_avx2_amd64.s (not visible here)
+// arithmeticScalarArrAvx2 forwards a pointer to a single left operand and an array right operand to the AVX2 assembly; it panics if right or out is empty because it takes &s[0].
+func arithmeticScalarArrAvx2(typ arrow.Type, op ArithmeticOp, left unsafe.Pointer, right, out []byte, len int) {
+ _arithmetic_scalar_arr_avx2(int(typ), int8(op), left, unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), len) // left is passed through unchanged
+}
diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_avx2_amd64.s b/go/arrow/compute/internal/kernels/base_arithmetic_avx2_amd64.s
new file mode 100644
index 00000000000..9b5a1a7767c
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/base_arithmetic_avx2_amd64.s
@@ -0,0 +1,12845 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+TEXT ·_arithmetic_avx2(SB), $0-48
+
+ MOVQ typ+0(FP), DI
+ MOVQ op+8(FP), SI
+ MOVQ inLeft+16(FP), DX
+ MOVQ inRight+24(FP), CX
+ MOVQ out+32(FP), R8
+ MOVQ len+40(FP), R9
+
+ LONG $0x01fe8040 // cmp sil, 1
+ JG LBB0_10
+ WORD $0x8440; BYTE $0xf6 // test sil, sil
+ JE LBB0_19
+ LONG $0x01fe8040 // cmp sil, 1
+ JNE LBB0_537
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_291
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_5
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_324
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_336
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_348
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_353
+
+LBB0_10:
+ LONG $0x02fe8040 // cmp sil, 2
+ JE LBB0_152
+ LONG $0x03fe8040 // cmp sil, 3
+ JNE LBB0_537
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_417
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_14
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_450
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_462
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_474
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_479
+
+LBB0_19:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_32
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_21
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_65
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_77
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_89
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_94
+
+LBB0_152:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_165
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_154
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_198
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_210
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_222
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_227
+
+LBB0_291:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_292
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_378
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_390
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_402
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_407
+
+LBB0_417:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_418
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_504
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_516
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_528
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_533
+
+LBB0_32:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_33
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_119
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_131
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_143
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_148
+
+LBB0_165:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_166
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_252
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_264
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_276
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_281
+
+LBB0_5:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_303
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_315
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_320
+
+LBB0_14:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_429
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_441
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_446
+
+LBB0_21:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_44
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_56
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_61
+
+LBB0_154:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_177
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_189
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_194
+
+LBB0_292:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_357
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_369
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_374
+
+LBB0_418:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_483
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_495
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_500
+
+LBB0_33:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_98
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_110
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_115
+
+LBB0_166:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_231
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_537
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_243
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_248
+
+LBB0_324:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_327
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_332
+
+LBB0_336:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_339
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_344
+
+LBB0_450:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_453
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_458
+
+LBB0_462:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_465
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_470
+
+LBB0_65:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_68
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_73
+
+LBB0_77:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_80
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_85
+
+LBB0_198:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_201
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_206
+
+LBB0_210:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x40f98341 // cmp r9d, 64
+ JAE LBB0_213
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_218
+
+LBB0_378:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_381
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_386
+
+LBB0_390:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_393
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_398
+
+LBB0_504:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_507
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_512
+
+LBB0_516:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_519
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_524
+
+LBB0_119:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_122
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_127
+
+LBB0_131:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_134
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_139
+
+LBB0_252:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_255
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_260
+
+LBB0_264:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_267
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_272
+
+LBB0_303:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_306
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_311
+
+LBB0_429:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_432
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_437
+
+LBB0_44:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_47
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_52
+
+LBB0_177:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JAE LBB0_180
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_185
+
+LBB0_357:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_360
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_365
+
+LBB0_483:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_486
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_491
+
+LBB0_98:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_101
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_106
+
+LBB0_231:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_537
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_234
+ WORD $0xf631 // xor esi, esi
+ JMP LBB0_239
+
+LBB0_348:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_353
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_353
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_351:
+ LONG $0x046ffec5; BYTE $0xba // vmovdqu ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20ba // vmovdqu ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40ba // vmovdqu ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60ba // vmovdqu ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x04fafdc5; BYTE $0xb9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi]
+ LONG $0x4cfaf5c5; WORD $0x20b9 // vpsubd ymm1, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x54faedc5; WORD $0x40b9 // vpsubd ymm2, ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5cfae5c5; WORD $0x60b9 // vpsubd ymm3, ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_351
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_353:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_355
+
+LBB0_354:
+ WORD $0x3c8b; BYTE $0xb2 // mov edi, dword [rdx + 4*rsi]
+ WORD $0x3c2b; BYTE $0xb1 // sub edi, dword [rcx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_354
+
+LBB0_355:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_356:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_356
+ JMP LBB0_537
+
+LBB0_474:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_479
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_479
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_477:
+ LONG $0x046ffec5; BYTE $0xba // vmovdqu ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20ba // vmovdqu ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40ba // vmovdqu ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60ba // vmovdqu ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x04fafdc5; BYTE $0xb9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi]
+ LONG $0x4cfaf5c5; WORD $0x20b9 // vpsubd ymm1, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x54faedc5; WORD $0x40b9 // vpsubd ymm2, ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5cfae5c5; WORD $0x60b9 // vpsubd ymm3, ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_477
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_479:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_481
+
+LBB0_480:
+ WORD $0x3c8b; BYTE $0xb2 // mov edi, dword [rdx + 4*rsi]
+ WORD $0x3c2b; BYTE $0xb1 // sub edi, dword [rcx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_480
+
+LBB0_481:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_482:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_482
+ JMP LBB0_537
+
+LBB0_89:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_94
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_94
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_92:
+ LONG $0x046ffec5; BYTE $0xb9 // vmovdqu ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20b9 // vmovdqu ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40b9 // vmovdqu ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60b9 // vmovdqu ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x04fefdc5; BYTE $0xba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi]
+ LONG $0x4cfef5c5; WORD $0x20ba // vpaddd ymm1, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x54feedc5; WORD $0x40ba // vpaddd ymm2, ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5cfee5c5; WORD $0x60ba // vpaddd ymm3, ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_92
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_94:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_96
+
+LBB0_95:
+ WORD $0x3c8b; BYTE $0xb1 // mov edi, dword [rcx + 4*rsi]
+ WORD $0x3c03; BYTE $0xb2 // add edi, dword [rdx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_95
+
+LBB0_96:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_97:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_97
+ JMP LBB0_537
+
+LBB0_222:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_227
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_227
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_225:
+ LONG $0x046ffec5; BYTE $0xb9 // vmovdqu ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20b9 // vmovdqu ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40b9 // vmovdqu ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60b9 // vmovdqu ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x04fefdc5; BYTE $0xba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi]
+ LONG $0x4cfef5c5; WORD $0x20ba // vpaddd ymm1, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x54feedc5; WORD $0x40ba // vpaddd ymm2, ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5cfee5c5; WORD $0x60ba // vpaddd ymm3, ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_225
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_227:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_229
+
+LBB0_228:
+ WORD $0x3c8b; BYTE $0xb1 // mov edi, dword [rcx + 4*rsi]
+ WORD $0x3c03; BYTE $0xb2 // add edi, dword [rdx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_228
+
+LBB0_229:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_230:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_230
+ JMP LBB0_537
+
+LBB0_402:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_407
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_407
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_405:
+ LONG $0x0410fdc5; BYTE $0xfa // vmovupd ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c10fdc5; WORD $0x20fa // vmovupd ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x5410fdc5; WORD $0x40fa // vmovupd ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c10fdc5; WORD $0x60fa // vmovupd ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x045cfdc5; BYTE $0xf9 // vsubpd ymm0, ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c5cf5c5; WORD $0x20f9 // vsubpd ymm1, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x545cedc5; WORD $0x40f9 // vsubpd ymm2, ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c5ce5c5; WORD $0x60f9 // vsubpd ymm3, ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf804 // vmovupd yword [r8 + 8*rdi], ymm0
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x117dc1c4; WORD $0xf854; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_405
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_407:
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_409
+
+LBB0_408:
+ LONG $0x0410fbc5; BYTE $0xf2 // vmovsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045cfbc5; BYTE $0xf1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_408
+
+LBB0_409:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537
+
+LBB0_410:
+ LONG $0x0410fbc5; BYTE $0xf2 // vmovsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045cfbc5; BYTE $0xf1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x4410fbc5; WORD $0x08f2 // vmovsd xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x445cfbc5; WORD $0x08f1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x08 // vmovsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x4410fbc5; WORD $0x10f2 // vmovsd xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x445cfbc5; WORD $0x10f1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x10 // vmovsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x4410fbc5; WORD $0x18f2 // vmovsd xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x445cfbc5; WORD $0x18f1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x18 // vmovsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_410
+ JMP LBB0_537
+
+LBB0_528:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_533
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_533
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_531:
+ LONG $0x0410fdc5; BYTE $0xfa // vmovupd ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c10fdc5; WORD $0x20fa // vmovupd ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x5410fdc5; WORD $0x40fa // vmovupd ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c10fdc5; WORD $0x60fa // vmovupd ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x045cfdc5; BYTE $0xf9 // vsubpd ymm0, ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c5cf5c5; WORD $0x20f9 // vsubpd ymm1, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x545cedc5; WORD $0x40f9 // vsubpd ymm2, ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c5ce5c5; WORD $0x60f9 // vsubpd ymm3, ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf804 // vmovupd yword [r8 + 8*rdi], ymm0
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x117dc1c4; WORD $0xf854; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_531
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_533:
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_535
+
+LBB0_534:
+ LONG $0x0410fbc5; BYTE $0xf2 // vmovsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045cfbc5; BYTE $0xf1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_534
+
+LBB0_535:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537
+
+LBB0_536:
+ LONG $0x0410fbc5; BYTE $0xf2 // vmovsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045cfbc5; BYTE $0xf1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x4410fbc5; WORD $0x08f2 // vmovsd xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x445cfbc5; WORD $0x08f1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x08 // vmovsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x4410fbc5; WORD $0x10f2 // vmovsd xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x445cfbc5; WORD $0x10f1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x10 // vmovsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x4410fbc5; WORD $0x18f2 // vmovsd xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x445cfbc5; WORD $0x18f1 // vsubsd xmm0, xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x18 // vmovsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_536
+ JMP LBB0_537
+
+LBB0_143:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_148
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_148
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_146:
+ LONG $0x0410fdc5; BYTE $0xf9 // vmovupd ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c10fdc5; WORD $0x20f9 // vmovupd ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x5410fdc5; WORD $0x40f9 // vmovupd ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c10fdc5; WORD $0x60f9 // vmovupd ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x0458fdc5; BYTE $0xfa // vaddpd ymm0, ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c58f5c5; WORD $0x20fa // vaddpd ymm1, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x5458edc5; WORD $0x40fa // vaddpd ymm2, ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c58e5c5; WORD $0x60fa // vaddpd ymm3, ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf804 // vmovupd yword [r8 + 8*rdi], ymm0
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x117dc1c4; WORD $0xf854; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_146
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_148:
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_150
+
+LBB0_149:
+ LONG $0x0410fbc5; BYTE $0xf1 // vmovsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x0458fbc5; BYTE $0xf2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_149
+
+LBB0_150:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537
+
+LBB0_151:
+ LONG $0x0410fbc5; BYTE $0xf1 // vmovsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x0458fbc5; BYTE $0xf2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x4410fbc5; WORD $0x08f1 // vmovsd xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x4458fbc5; WORD $0x08f2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x08 // vmovsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x4410fbc5; WORD $0x10f1 // vmovsd xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x4458fbc5; WORD $0x10f2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x10 // vmovsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x4410fbc5; WORD $0x18f1 // vmovsd xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x4458fbc5; WORD $0x18f2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x18 // vmovsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_151
+ JMP LBB0_537
+
+LBB0_276:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_281
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_281
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_279:
+ LONG $0x0410fdc5; BYTE $0xf9 // vmovupd ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c10fdc5; WORD $0x20f9 // vmovupd ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x5410fdc5; WORD $0x40f9 // vmovupd ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c10fdc5; WORD $0x60f9 // vmovupd ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x0458fdc5; BYTE $0xfa // vaddpd ymm0, ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c58f5c5; WORD $0x20fa // vaddpd ymm1, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x5458edc5; WORD $0x40fa // vaddpd ymm2, ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c58e5c5; WORD $0x60fa // vaddpd ymm3, ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf804 // vmovupd yword [r8 + 8*rdi], ymm0
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x117dc1c4; WORD $0xf854; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_279
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_281:
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_283
+
+LBB0_282:
+ LONG $0x0410fbc5; BYTE $0xf1 // vmovsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x0458fbc5; BYTE $0xf2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_282
+
+LBB0_283:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537
+
+LBB0_284:
+ LONG $0x0410fbc5; BYTE $0xf1 // vmovsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x0458fbc5; BYTE $0xf2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi]
+ LONG $0x117bc1c4; WORD $0xf004 // vmovsd qword [r8 + 8*rsi], xmm0
+ LONG $0x4410fbc5; WORD $0x08f1 // vmovsd xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x4458fbc5; WORD $0x08f2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x08 // vmovsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x4410fbc5; WORD $0x10f1 // vmovsd xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x4458fbc5; WORD $0x10f2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x10 // vmovsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x4410fbc5; WORD $0x18f1 // vmovsd xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x4458fbc5; WORD $0x18f2 // vaddsd xmm0, xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x117bc1c4; WORD $0xf044; BYTE $0x18 // vmovsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_284
+ JMP LBB0_537
+
+LBB0_315:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_320
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_320
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_318:
+ LONG $0x046ffec5; BYTE $0x3a // vmovdqu ymm0, yword [rdx + rdi]
+ LONG $0x4c6ffec5; WORD $0x203a // vmovdqu ymm1, yword [rdx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x403a // vmovdqu ymm2, yword [rdx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x603a // vmovdqu ymm3, yword [rdx + rdi + 96]
+ LONG $0x04f8fdc5; BYTE $0x39 // vpsubb ymm0, ymm0, yword [rcx + rdi]
+ LONG $0x4cf8f5c5; WORD $0x2039 // vpsubb ymm1, ymm1, yword [rcx + rdi + 32]
+ LONG $0x54f8edc5; WORD $0x4039 // vpsubb ymm2, ymm2, yword [rcx + rdi + 64]
+ LONG $0x5cf8e5c5; WORD $0x6039 // vpsubb ymm3, ymm3, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_318
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_320:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_322
+
+LBB0_321:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_321
+
+LBB0_322:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_323:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_323
+ JMP LBB0_537
+
+LBB0_441:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_446
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_446
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_444:
+ LONG $0x046ffec5; BYTE $0x3a // vmovdqu ymm0, yword [rdx + rdi]
+ LONG $0x4c6ffec5; WORD $0x203a // vmovdqu ymm1, yword [rdx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x403a // vmovdqu ymm2, yword [rdx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x603a // vmovdqu ymm3, yword [rdx + rdi + 96]
+ LONG $0x04f8fdc5; BYTE $0x39 // vpsubb ymm0, ymm0, yword [rcx + rdi]
+ LONG $0x4cf8f5c5; WORD $0x2039 // vpsubb ymm1, ymm1, yword [rcx + rdi + 32]
+ LONG $0x54f8edc5; WORD $0x4039 // vpsubb ymm2, ymm2, yword [rcx + rdi + 64]
+ LONG $0x5cf8e5c5; WORD $0x6039 // vpsubb ymm3, ymm3, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_444
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_446:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_448
+
+LBB0_447:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_447
+
+LBB0_448:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_449:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_449
+ JMP LBB0_537
+
+LBB0_56:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_61
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_61
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_59:
+ LONG $0x046ffec5; BYTE $0x39 // vmovdqu ymm0, yword [rcx + rdi]
+ LONG $0x4c6ffec5; WORD $0x2039 // vmovdqu ymm1, yword [rcx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x4039 // vmovdqu ymm2, yword [rcx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6039 // vmovdqu ymm3, yword [rcx + rdi + 96]
+ LONG $0x04fcfdc5; BYTE $0x3a // vpaddb ymm0, ymm0, yword [rdx + rdi]
+ LONG $0x4cfcf5c5; WORD $0x203a // vpaddb ymm1, ymm1, yword [rdx + rdi + 32]
+ LONG $0x54fcedc5; WORD $0x403a // vpaddb ymm2, ymm2, yword [rdx + rdi + 64]
+ LONG $0x5cfce5c5; WORD $0x603a // vpaddb ymm3, ymm3, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_59
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_61:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_63
+
+LBB0_62:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_62
+
+LBB0_63:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_64:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_64
+ JMP LBB0_537
+
+LBB0_189:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_194
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_194
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_192:
+ LONG $0x046ffec5; BYTE $0x39 // vmovdqu ymm0, yword [rcx + rdi]
+ LONG $0x4c6ffec5; WORD $0x2039 // vmovdqu ymm1, yword [rcx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x4039 // vmovdqu ymm2, yword [rcx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6039 // vmovdqu ymm3, yword [rcx + rdi + 96]
+ LONG $0x04fcfdc5; BYTE $0x3a // vpaddb ymm0, ymm0, yword [rdx + rdi]
+ LONG $0x4cfcf5c5; WORD $0x203a // vpaddb ymm1, ymm1, yword [rdx + rdi + 32]
+ LONG $0x54fcedc5; WORD $0x403a // vpaddb ymm2, ymm2, yword [rdx + rdi + 64]
+ LONG $0x5cfce5c5; WORD $0x603a // vpaddb ymm3, ymm3, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_192
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_194:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_196
+
+LBB0_195:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_195
+
+LBB0_196:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_197:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_197
+ JMP LBB0_537
+
+LBB0_369:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_374
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_374
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_372:
+ LONG $0x046ffec5; BYTE $0xfa // vmovdqu ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20fa // vmovdqu ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40fa // vmovdqu ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60fa // vmovdqu ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x04fbfdc5; BYTE $0xf9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi]
+ LONG $0x4cfbf5c5; WORD $0x20f9 // vpsubq ymm1, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x54fbedc5; WORD $0x40f9 // vpsubq ymm2, ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5cfbe5c5; WORD $0x60f9 // vpsubq ymm3, ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_372
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_374:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_376
+
+LBB0_375:
+ LONG $0xf23c8b48 // mov rdi, qword [rdx + 8*rsi]
+ LONG $0xf13c2b48 // sub rdi, qword [rcx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_375
+
+LBB0_376:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_377:
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_377
+ JMP LBB0_537
+
+LBB0_495: // 64-bit element subtract, r10 = element count: [r8][i] = [rdx][i] - [rcx][i]; begins with an overlap check
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_500
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_500
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- rsi = low 32 bits of the count rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_498: // vector loop: 4 ymm registers = 16 qwords per pass, rdi = element index. NOTE(review): do-while form needs count >= 16, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0xfa // vmovdqu ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20fa // vmovdqu ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40fa // vmovdqu ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60fa // vmovdqu ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x04fbfdc5; BYTE $0xf9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi]
+ LONG $0x4cfbf5c5; WORD $0x20f9 // vpsubq ymm1, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x54fbedc5; WORD $0x40f9 // vpsubq ymm2, ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5cfbe5c5; WORD $0x60f9 // vpsubq ymm3, ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_498
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_500: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_502
+
+LBB0_501: // peel (r10 & 3) elements one at a time
+ LONG $0xf23c8b48 // mov rdi, qword [rdx + 8*rsi]
+ LONG $0xf13c2b48 // sub rdi, qword [rcx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_501
+
+LBB0_502: // r9 < 3 => at most 3 elements were left at LBB0_500, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_503: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_503
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_110: // 64-bit element add, r10 = element count: [r8][i] = [rcx][i] + [rdx][i]; begins with an overlap check
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_115
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_115
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- rsi = low 32 bits of the count rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_113: // vector loop: 4 ymm registers = 16 qwords per pass, rdi = element index. NOTE(review): do-while form needs count >= 16, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0xf9 // vmovdqu ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20f9 // vmovdqu ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40f9 // vmovdqu ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60f9 // vmovdqu ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x04d4fdc5; BYTE $0xfa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi]
+ LONG $0x4cd4f5c5; WORD $0x20fa // vpaddq ymm1, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x54d4edc5; WORD $0x40fa // vpaddq ymm2, ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5cd4e5c5; WORD $0x60fa // vpaddq ymm3, ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_113
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_115: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_117
+
+LBB0_116: // peel (r10 & 3) elements one at a time
+ LONG $0xf13c8b48 // mov rdi, qword [rcx + 8*rsi]
+ LONG $0xf23c0348 // add rdi, qword [rdx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_116
+
+LBB0_117: // r9 < 3 => at most 3 elements were left at LBB0_115, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_118: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_118
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_243: // 64-bit element add, r10 = element count: [r8][i] = [rcx][i] + [rdx][i]; begins with an overlap check
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_248
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_248
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- rsi = low 32 bits of the count rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_246: // vector loop: 4 ymm registers = 16 qwords per pass, rdi = element index. NOTE(review): do-while form needs count >= 16, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0xf9 // vmovdqu ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20f9 // vmovdqu ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40f9 // vmovdqu ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60f9 // vmovdqu ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x04d4fdc5; BYTE $0xfa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi]
+ LONG $0x4cd4f5c5; WORD $0x20fa // vpaddq ymm1, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x54d4edc5; WORD $0x40fa // vpaddq ymm2, ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5cd4e5c5; WORD $0x60fa // vpaddq ymm3, ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_246
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_248: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_250
+
+LBB0_249: // peel (r10 & 3) elements one at a time
+ LONG $0xf13c8b48 // mov rdi, qword [rcx + 8*rsi]
+ LONG $0xf23c0348 // add rdi, qword [rdx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_249
+
+LBB0_250: // r9 < 3 => at most 3 elements were left at LBB0_248, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_251: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_251
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_327: // 16-bit element subtract, r10 = element count: [r8][i] = [rdx][i] - [rcx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_332
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_332
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_330: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x7a // vmovdqu ymm0, yword [rdx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x207a // vmovdqu ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x407a // vmovdqu ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x607a // vmovdqu ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x04f9fdc5; BYTE $0x79 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi]
+ LONG $0x4cf9f5c5; WORD $0x2079 // vpsubw ymm1, ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x54f9edc5; WORD $0x4079 // vpsubw ymm2, ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5cf9e5c5; WORD $0x6079 // vpsubw ymm3, ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_330
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_332: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_334
+
+LBB0_333: // peel (r10 & 3) elements one at a time
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_333
+
+LBB0_334: // r9 < 3 => at most 3 elements were left at LBB0_332, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_335: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_335
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_339: // 16-bit element subtract, r10 = element count: [r8][i] = [rdx][i] - [rcx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_344
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_344
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_342: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x7a // vmovdqu ymm0, yword [rdx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x207a // vmovdqu ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x407a // vmovdqu ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x607a // vmovdqu ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x04f9fdc5; BYTE $0x79 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi]
+ LONG $0x4cf9f5c5; WORD $0x2079 // vpsubw ymm1, ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x54f9edc5; WORD $0x4079 // vpsubw ymm2, ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5cf9e5c5; WORD $0x6079 // vpsubw ymm3, ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_342
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_344: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_346
+
+LBB0_345: // peel (r10 & 3) elements one at a time
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_345
+
+LBB0_346: // r9 < 3 => at most 3 elements were left at LBB0_344, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_347: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_347
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_453: // 16-bit element subtract, r10 = element count: [r8][i] = [rdx][i] - [rcx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_458
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_458
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_456: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x7a // vmovdqu ymm0, yword [rdx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x207a // vmovdqu ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x407a // vmovdqu ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x607a // vmovdqu ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x04f9fdc5; BYTE $0x79 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi]
+ LONG $0x4cf9f5c5; WORD $0x2079 // vpsubw ymm1, ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x54f9edc5; WORD $0x4079 // vpsubw ymm2, ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5cf9e5c5; WORD $0x6079 // vpsubw ymm3, ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_456
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_458: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_460
+
+LBB0_459: // peel (r10 & 3) elements one at a time
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_459
+
+LBB0_460: // r9 < 3 => at most 3 elements were left at LBB0_458, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_461: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_461
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_465: // 16-bit element subtract, r10 = element count: [r8][i] = [rdx][i] - [rcx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_470
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_470
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_468: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x7a // vmovdqu ymm0, yword [rdx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x207a // vmovdqu ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x407a // vmovdqu ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x607a // vmovdqu ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x04f9fdc5; BYTE $0x79 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi]
+ LONG $0x4cf9f5c5; WORD $0x2079 // vpsubw ymm1, ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x54f9edc5; WORD $0x4079 // vpsubw ymm2, ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5cf9e5c5; WORD $0x6079 // vpsubw ymm3, ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_468
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_470: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_472
+
+LBB0_471: // peel (r10 & 3) elements one at a time
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_471
+
+LBB0_472: // r9 < 3 => at most 3 elements were left at LBB0_470, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_473: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_473
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_68: // 16-bit element add, r10 = element count: [r8][i] = [rcx][i] + [rdx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_73
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_73
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_71: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x79 // vmovdqu ymm0, yword [rcx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x2079 // vmovdqu ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x4079 // vmovdqu ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6079 // vmovdqu ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x04fdfdc5; BYTE $0x7a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi]
+ LONG $0x4cfdf5c5; WORD $0x207a // vpaddw ymm1, ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x54fdedc5; WORD $0x407a // vpaddw ymm2, ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5cfde5c5; WORD $0x607a // vpaddw ymm3, ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_71
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_73: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_75
+
+LBB0_74: // peel (r10 & 3) elements one at a time
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ LONG $0x723c0366 // add di, word [rdx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_74
+
+LBB0_75: // r9 < 3 => at most 3 elements were left at LBB0_73, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_76: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_76
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_80: // 16-bit element add, r10 = element count: [r8][i] = [rcx][i] + [rdx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_85
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_85
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_83: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x79 // vmovdqu ymm0, yword [rcx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x2079 // vmovdqu ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x4079 // vmovdqu ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6079 // vmovdqu ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x04fdfdc5; BYTE $0x7a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi]
+ LONG $0x4cfdf5c5; WORD $0x207a // vpaddw ymm1, ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x54fdedc5; WORD $0x407a // vpaddw ymm2, ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5cfde5c5; WORD $0x607a // vpaddw ymm3, ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_83
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_85: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_87
+
+LBB0_86: // peel (r10 & 3) elements one at a time
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ LONG $0x723c0366 // add di, word [rdx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_86
+
+LBB0_87: // r9 < 3 => at most 3 elements were left at LBB0_85, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_88: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_88
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_201: // 16-bit element add, r10 = element count: [r8][i] = [rcx][i] + [rdx][i]; begins with an overlap check
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi -- the scalar path starts at element index 0
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b -- both set => output range overlaps the rdx input
+ JNE LBB0_206
+ WORD $0x2040; BYTE $0xf8 // and al, dil -- nonzero => output range overlaps the rcx input
+ JNE LBB0_206
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 -- rsi = low 32 bits of the count rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_204: // vector loop: 4 ymm registers = 64 words per pass, rdi = element index. NOTE(review): do-while form needs count >= 64, presumably checked at the jump site -- confirm
+ LONG $0x046ffec5; BYTE $0x79 // vmovdqu ymm0, yword [rcx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x2079 // vmovdqu ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x4079 // vmovdqu ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6079 // vmovdqu ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x04fdfdc5; BYTE $0x7a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi]
+ LONG $0x4cfdf5c5; WORD $0x207a // vpaddw ymm1, ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x54fdedc5; WORD $0x407a // vpaddw ymm2, ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5cfde5c5; WORD $0x607a // vpaddw ymm3, ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_204
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 -- equal => the vector loop covered every element
+ JE LBB0_537
+
+LBB0_206: // scalar tail starting at index rsi: r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_208
+
+LBB0_207: // peel (r10 & 3) elements one at a time
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ LONG $0x723c0366 // add di, word [rdx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_207
+
+LBB0_208: // r9 < 3 => at most 3 elements were left at LBB0_206, all handled by the peel loop
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_209: // remaining elements, unrolled 4 per pass, until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_209
+ JMP LBB0_537 // LBB0_537 lies outside this chunk; presumably the shared exit -- confirm
+
+LBB0_213: // 16-bit integer add kernel, r8[i] = rcx[i] + rdx[i] for i in [0, r10): overlap check, then AVX2 loop with scalar tail
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10] ; rsi = one past the end of the output range
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_218 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_218 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xc0 // and esi, -64 ; rsi = low 32 bits of r10 rounded down to a multiple of 64
+ WORD $0xff31 // xor edi, edi
+
+LBB0_216: // AVX2 loop: 4 x 32-byte vectors (64 elements) per iteration, rdi = element index
+ LONG $0x046ffec5; BYTE $0x79 // vmovdqu ymm0, yword [rcx + 2*rdi]
+ LONG $0x4c6ffec5; WORD $0x2079 // vmovdqu ymm1, yword [rcx + 2*rdi + 32]
+ LONG $0x546ffec5; WORD $0x4079 // vmovdqu ymm2, yword [rcx + 2*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6079 // vmovdqu ymm3, yword [rcx + 2*rdi + 96]
+ LONG $0x04fdfdc5; BYTE $0x7a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi]
+ LONG $0x4cfdf5c5; WORD $0x207a // vpaddw ymm1, ymm1, yword [rdx + 2*rdi + 32]
+ LONG $0x54fdedc5; WORD $0x407a // vpaddw ymm2, ymm2, yword [rdx + 2*rdi + 64]
+ LONG $0x5cfde5c5; WORD $0x607a // vpaddw ymm3, ymm3, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x7804 // vmovdqu yword [r8 + 2*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x785c; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm3
+ LONG $0x40c78348 // add rdi, 64
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_216 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 64 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_218: // scalar tail from element rsi (0 when an overlap was detected): r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_220 // r10 & 3 == 0: no peel needed
+
+LBB0_219: // peel loop: process r10 & 3 elements one at a time
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ LONG $0x723c0366 // add di, word [rdx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_219
+
+LBB0_220:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537 // r9 < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_221: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_221
+ JMP LBB0_537
+
+LBB0_381: // 64-bit integer subtract kernel, r8[i] = rdx[i] - rcx[i] for i in [0, r10): overlap check, then AVX2 loop with scalar tail
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10] ; rsi = one past the end of the output range
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_386 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_386 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; rsi = low 32 bits of r10 rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_384: // AVX2 loop: 4 x 32-byte vectors (16 elements) per iteration, rdi = element index
+ LONG $0x046ffec5; BYTE $0xfa // vmovdqu ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20fa // vmovdqu ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40fa // vmovdqu ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60fa // vmovdqu ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x04fbfdc5; BYTE $0xf9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi]
+ LONG $0x4cfbf5c5; WORD $0x20f9 // vpsubq ymm1, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x54fbedc5; WORD $0x40f9 // vpsubq ymm2, ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5cfbe5c5; WORD $0x60f9 // vpsubq ymm3, ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_384 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 16 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_386: // scalar tail from element rsi (0 when an overlap was detected): r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_388 // r10 & 3 == 0: no peel needed
+
+LBB0_387: // peel loop: process r10 & 3 elements one at a time
+ LONG $0xf23c8b48 // mov rdi, qword [rdx + 8*rsi]
+ LONG $0xf13c2b48 // sub rdi, qword [rcx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_387
+
+LBB0_388:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537 // r9 < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_389: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_389
+ JMP LBB0_537
+
+LBB0_393: // float32 subtract kernel, r8[i] = rdx[i] - rcx[i] for i in [0, r10): overlap check, then AVX loop with scalar tail
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10] ; rsi = one past the end of the output range
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_398 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_398 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; rsi = low 32 bits of r10 rounded down to a multiple of 32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_396: // AVX loop: 4 x 32-byte vectors (32 elements) per iteration, rdi = element index
+ LONG $0x0410fcc5; BYTE $0xba // vmovups ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c10fcc5; WORD $0x20ba // vmovups ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x5410fcc5; WORD $0x40ba // vmovups ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c10fcc5; WORD $0x60ba // vmovups ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x045cfcc5; BYTE $0xb9 // vsubps ymm0, ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c5cf4c5; WORD $0x20b9 // vsubps ymm1, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x545cecc5; WORD $0x40b9 // vsubps ymm2, ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c5ce4c5; WORD $0x60b9 // vsubps ymm3, ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb804 // vmovups yword [r8 + 4*rdi], ymm0
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x117cc1c4; WORD $0xb854; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_396 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 32 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_398: // scalar tail from element rsi (0 when an overlap was detected): rdi = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_400 // r10 & 3 == 0: no peel needed
+
+LBB0_399: // peel loop: process r10 & 3 elements one at a time
+ LONG $0x0410fac5; BYTE $0xb2 // vmovss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045cfac5; BYTE $0xb1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_399
+
+LBB0_400:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537 // rdi < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_401: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0x0410fac5; BYTE $0xb2 // vmovss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045cfac5; BYTE $0xb1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x4410fac5; WORD $0x04b2 // vmovss xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x445cfac5; WORD $0x04b1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x04 // vmovss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x4410fac5; WORD $0x08b2 // vmovss xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x445cfac5; WORD $0x08b1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x08 // vmovss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x4410fac5; WORD $0x0cb2 // vmovss xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x445cfac5; WORD $0x0cb1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x0c // vmovss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_401
+ JMP LBB0_537
+
+LBB0_507: // 64-bit integer subtract kernel, r8[i] = rdx[i] - rcx[i] for i in [0, r10): overlap check, then AVX2 loop with scalar tail
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10] ; rsi = one past the end of the output range
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_512 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_512 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; rsi = low 32 bits of r10 rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_510: // AVX2 loop: 4 x 32-byte vectors (16 elements) per iteration, rdi = element index
+ LONG $0x046ffec5; BYTE $0xfa // vmovdqu ymm0, yword [rdx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20fa // vmovdqu ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40fa // vmovdqu ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60fa // vmovdqu ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x04fbfdc5; BYTE $0xf9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi]
+ LONG $0x4cfbf5c5; WORD $0x20f9 // vpsubq ymm1, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x54fbedc5; WORD $0x40f9 // vpsubq ymm2, ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5cfbe5c5; WORD $0x60f9 // vpsubq ymm3, ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_510 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 16 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_512: // scalar tail from element rsi (0 when an overlap was detected): r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_514 // r10 & 3 == 0: no peel needed
+
+LBB0_513: // peel loop: process r10 & 3 elements one at a time
+ LONG $0xf23c8b48 // mov rdi, qword [rdx + 8*rsi]
+ LONG $0xf13c2b48 // sub rdi, qword [rcx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_513
+
+LBB0_514:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537 // r9 < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_515: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_515
+ JMP LBB0_537
+
+LBB0_519: // float32 subtract kernel, r8[i] = rdx[i] - rcx[i] for i in [0, r10): overlap check, then AVX loop with scalar tail
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10] ; rsi = one past the end of the output range
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_524 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_524 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; rsi = low 32 bits of r10 rounded down to a multiple of 32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_522: // AVX loop: 4 x 32-byte vectors (32 elements) per iteration, rdi = element index
+ LONG $0x0410fcc5; BYTE $0xba // vmovups ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c10fcc5; WORD $0x20ba // vmovups ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x5410fcc5; WORD $0x40ba // vmovups ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c10fcc5; WORD $0x60ba // vmovups ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x045cfcc5; BYTE $0xb9 // vsubps ymm0, ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c5cf4c5; WORD $0x20b9 // vsubps ymm1, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x545cecc5; WORD $0x40b9 // vsubps ymm2, ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c5ce4c5; WORD $0x60b9 // vsubps ymm3, ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb804 // vmovups yword [r8 + 4*rdi], ymm0
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x117cc1c4; WORD $0xb854; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_522 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 32 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_524: // scalar tail from element rsi (0 when an overlap was detected): rdi = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_526 // r10 & 3 == 0: no peel needed
+
+LBB0_525: // peel loop: process r10 & 3 elements one at a time
+ LONG $0x0410fac5; BYTE $0xb2 // vmovss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045cfac5; BYTE $0xb1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_525
+
+LBB0_526:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537 // rdi < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_527: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0x0410fac5; BYTE $0xb2 // vmovss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045cfac5; BYTE $0xb1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x4410fac5; WORD $0x04b2 // vmovss xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x445cfac5; WORD $0x04b1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x04 // vmovss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x4410fac5; WORD $0x08b2 // vmovss xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x445cfac5; WORD $0x08b1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x08 // vmovss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x4410fac5; WORD $0x0cb2 // vmovss xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x445cfac5; WORD $0x0cb1 // vsubss xmm0, xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x0c // vmovss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_527
+ JMP LBB0_537
+
+LBB0_122: // 64-bit integer add kernel, r8[i] = rcx[i] + rdx[i] for i in [0, r10): overlap check, then AVX2 loop with scalar tail
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10] ; rsi = one past the end of the output range
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_127 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_127 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; rsi = low 32 bits of r10 rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_125: // AVX2 loop: 4 x 32-byte vectors (16 elements) per iteration, rdi = element index
+ LONG $0x046ffec5; BYTE $0xf9 // vmovdqu ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20f9 // vmovdqu ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40f9 // vmovdqu ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60f9 // vmovdqu ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x04d4fdc5; BYTE $0xfa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi]
+ LONG $0x4cd4f5c5; WORD $0x20fa // vpaddq ymm1, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x54d4edc5; WORD $0x40fa // vpaddq ymm2, ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5cd4e5c5; WORD $0x60fa // vpaddq ymm3, ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_125 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 16 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_127: // scalar tail from element rsi (0 when an overlap was detected): r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_129 // r10 & 3 == 0: no peel needed
+
+LBB0_128: // peel loop: process r10 & 3 elements one at a time
+ LONG $0xf13c8b48 // mov rdi, qword [rcx + 8*rsi]
+ LONG $0xf23c0348 // add rdi, qword [rdx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_128
+
+LBB0_129:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537 // r9 < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_130: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_130
+ JMP LBB0_537
+
+LBB0_134: // float32 add kernel, r8[i] = rcx[i] + rdx[i] for i in [0, r10): overlap check, then AVX loop with scalar tail
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10] ; rsi = one past the end of the output range
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_139 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_139 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; rsi = low 32 bits of r10 rounded down to a multiple of 32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_137: // AVX loop: 4 x 32-byte vectors (32 elements) per iteration, rdi = element index
+ LONG $0x0410fcc5; BYTE $0xb9 // vmovups ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c10fcc5; WORD $0x20b9 // vmovups ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x5410fcc5; WORD $0x40b9 // vmovups ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c10fcc5; WORD $0x60b9 // vmovups ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x0458fcc5; BYTE $0xba // vaddps ymm0, ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c58f4c5; WORD $0x20ba // vaddps ymm1, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x5458ecc5; WORD $0x40ba // vaddps ymm2, ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c58e4c5; WORD $0x60ba // vaddps ymm3, ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb804 // vmovups yword [r8 + 4*rdi], ymm0
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x117cc1c4; WORD $0xb854; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_137 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 32 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_139: // scalar tail from element rsi (0 when an overlap was detected): rdi = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_141 // r10 & 3 == 0: no peel needed
+
+LBB0_140: // peel loop: process r10 & 3 elements one at a time
+ LONG $0x0410fac5; BYTE $0xb1 // vmovss xmm0, dword [rcx + 4*rsi]
+ LONG $0x0458fac5; BYTE $0xb2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_140
+
+LBB0_141:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537 // rdi < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_142: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0x0410fac5; BYTE $0xb1 // vmovss xmm0, dword [rcx + 4*rsi]
+ LONG $0x0458fac5; BYTE $0xb2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x4410fac5; WORD $0x04b1 // vmovss xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x4458fac5; WORD $0x04b2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x04 // vmovss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x4410fac5; WORD $0x08b1 // vmovss xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x4458fac5; WORD $0x08b2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x08 // vmovss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x4410fac5; WORD $0x0cb1 // vmovss xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x4458fac5; WORD $0x0cb2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x0c // vmovss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_142
+ JMP LBB0_537
+
+LBB0_255: // 64-bit integer add kernel, r8[i] = rcx[i] + rdx[i] for i in [0, r10): overlap check, then AVX2 loop with scalar tail
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10] ; rsi = one past the end of the output range
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_260 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_260 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; rsi = low 32 bits of r10 rounded down to a multiple of 16
+ WORD $0xff31 // xor edi, edi
+
+LBB0_258: // AVX2 loop: 4 x 32-byte vectors (16 elements) per iteration, rdi = element index
+ LONG $0x046ffec5; BYTE $0xf9 // vmovdqu ymm0, yword [rcx + 8*rdi]
+ LONG $0x4c6ffec5; WORD $0x20f9 // vmovdqu ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40f9 // vmovdqu ymm2, yword [rcx + 8*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60f9 // vmovdqu ymm3, yword [rcx + 8*rdi + 96]
+ LONG $0x04d4fdc5; BYTE $0xfa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi]
+ LONG $0x4cd4f5c5; WORD $0x20fa // vpaddq ymm1, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x54d4edc5; WORD $0x40fa // vpaddq ymm2, ymm2, yword [rdx + 8*rdi + 64]
+ LONG $0x5cd4e5c5; WORD $0x60fa // vpaddq ymm3, ymm3, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf804 // vmovdqu yword [r8 + 8*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xf84c; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm3
+ LONG $0x10c78348 // add rdi, 16
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_258 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 16 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_260: // scalar tail from element rsi (0 when an overlap was detected): r9 = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_262 // r10 & 3 == 0: no peel needed
+
+LBB0_261: // peel loop: process r10 & 3 elements one at a time
+ LONG $0xf13c8b48 // mov rdi, qword [rcx + 8*rsi]
+ LONG $0xf23c0348 // add rdi, qword [rdx + 8*rsi]
+ LONG $0xf03c8949 // mov qword [r8 + 8*rsi], rdi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_261
+
+LBB0_262:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537 // r9 < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_263: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_263
+ JMP LBB0_537
+
+LBB0_267: // float32 add kernel, r8[i] = rcx[i] + rdx[i] for i in [0, r10): overlap check, then AVX loop with scalar tail
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10] ; rsi = one past the end of the output range
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_272 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_272 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; rsi = low 32 bits of r10 rounded down to a multiple of 32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_270: // AVX loop: 4 x 32-byte vectors (32 elements) per iteration, rdi = element index
+ LONG $0x0410fcc5; BYTE $0xb9 // vmovups ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c10fcc5; WORD $0x20b9 // vmovups ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x5410fcc5; WORD $0x40b9 // vmovups ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c10fcc5; WORD $0x60b9 // vmovups ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x0458fcc5; BYTE $0xba // vaddps ymm0, ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c58f4c5; WORD $0x20ba // vaddps ymm1, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x5458ecc5; WORD $0x40ba // vaddps ymm2, ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c58e4c5; WORD $0x60ba // vaddps ymm3, ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb804 // vmovups yword [r8 + 4*rdi], ymm0
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x117cc1c4; WORD $0xb854; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_270 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 32 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_272: // scalar tail from element rsi (0 when an overlap was detected): rdi = r10 - rsi - 1, rax = r10 & 3
+ WORD $0x8948; BYTE $0xf7 // mov rdi, rsi
+ WORD $0xf748; BYTE $0xd7 // not rdi
+ WORD $0x014c; BYTE $0xd7 // add rdi, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_274 // r10 & 3 == 0: no peel needed
+
+LBB0_273: // peel loop: process r10 & 3 elements one at a time
+ LONG $0x0410fac5; BYTE $0xb1 // vmovss xmm0, dword [rcx + 4*rsi]
+ LONG $0x0458fac5; BYTE $0xb2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_273
+
+LBB0_274:
+ LONG $0x03ff8348 // cmp rdi, 3
+ JB LBB0_537 // rdi < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_275: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0x0410fac5; BYTE $0xb1 // vmovss xmm0, dword [rcx + 4*rsi]
+ LONG $0x0458fac5; BYTE $0xb2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi]
+ LONG $0x117ac1c4; WORD $0xb004 // vmovss dword [r8 + 4*rsi], xmm0
+ LONG $0x4410fac5; WORD $0x04b1 // vmovss xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x4458fac5; WORD $0x04b2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x04 // vmovss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x4410fac5; WORD $0x08b1 // vmovss xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x4458fac5; WORD $0x08b2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x08 // vmovss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x4410fac5; WORD $0x0cb1 // vmovss xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x4458fac5; WORD $0x0cb2 // vaddss xmm0, xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x117ac1c4; WORD $0xb044; BYTE $0x0c // vmovss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_275
+ JMP LBB0_537
+
+LBB0_306: // 8-bit integer subtract kernel, r8[i] = rdx[i] - rcx[i] for i in [0, r10): overlap check, then AVX2 loop with scalar tail
+ LONG $0x10348d4b // lea rsi, [r8 + r10] ; rsi = one past the end of the output range
+ LONG $0x12048d4a // lea rax, [rdx + r10] ; rax = one past the end of the rdx range
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10] ; rax = one past the end of the rcx range
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b ; nonzero when the rdx range and the r8 range overlap
+ JNE LBB0_311 // output overlaps the rdx input: use the scalar path from element 0
+ WORD $0x2040; BYTE $0xf8 // and al, dil ; nonzero when the rcx range and the r8 range overlap
+ JNE LBB0_311 // output overlaps the rcx input: use the scalar path from element 0
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; rsi = low 32 bits of r10 rounded down to a multiple of 128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_309: // AVX2 loop: 4 x 32-byte vectors (128 elements) per iteration, rdi = element index
+ LONG $0x046ffec5; BYTE $0x3a // vmovdqu ymm0, yword [rdx + rdi]
+ LONG $0x4c6ffec5; WORD $0x203a // vmovdqu ymm1, yword [rdx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x403a // vmovdqu ymm2, yword [rdx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x603a // vmovdqu ymm3, yword [rdx + rdi + 96]
+ LONG $0x04f8fdc5; BYTE $0x39 // vpsubb ymm0, ymm0, yword [rcx + rdi]
+ LONG $0x4cf8f5c5; WORD $0x2039 // vpsubb ymm1, ymm1, yword [rcx + rdi + 32]
+ LONG $0x54f8edc5; WORD $0x4039 // vpsubb ymm2, ymm2, yword [rcx + rdi + 64]
+ LONG $0x5cf8e5c5; WORD $0x6039 // vpsubb ymm3, ymm3, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128 ; i.e. rdi += 128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_309 // loop until rdi == rsi; NOTE(review): bottom-tested, presumably only reached with r10 >= 128 - confirm at the jump site outside this chunk
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537 // no remainder after the vector loop: done (LBB0_537 is outside this chunk)
+
+LBB0_311: // scalar tail from element rsi (0 when an overlap was detected): r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_313 // r10 & 3 == 0: no peel needed
+
+LBB0_312: // peel loop: process r10 & 3 elements one at a time
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_312
+
+LBB0_313:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537 // r9 < 3: only 1 to 3 elements were left on entry to the tail and the peel loop handled them
+
+LBB0_314: // scalar loop unrolled 4x: four elements per iteration until rsi == r10
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_314
+ JMP LBB0_537
+
+LBB0_432:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_437
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_437
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_435:
+ LONG $0x046ffec5; BYTE $0x3a // vmovdqu ymm0, yword [rdx + rdi]
+ LONG $0x4c6ffec5; WORD $0x203a // vmovdqu ymm1, yword [rdx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x403a // vmovdqu ymm2, yword [rdx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x603a // vmovdqu ymm3, yword [rdx + rdi + 96]
+ LONG $0x04f8fdc5; BYTE $0x39 // vpsubb ymm0, ymm0, yword [rcx + rdi]
+ LONG $0x4cf8f5c5; WORD $0x2039 // vpsubb ymm1, ymm1, yword [rcx + rdi + 32]
+ LONG $0x54f8edc5; WORD $0x4039 // vpsubb ymm2, ymm2, yword [rcx + rdi + 64]
+ LONG $0x5cf8e5c5; WORD $0x6039 // vpsubb ymm3, ymm3, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_435
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_437:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_439
+
+LBB0_438:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_438
+
+LBB0_439:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_440:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_440
+ JMP LBB0_537
+
+LBB0_47:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_52
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_52
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_50:
+ LONG $0x046ffec5; BYTE $0x39 // vmovdqu ymm0, yword [rcx + rdi]
+ LONG $0x4c6ffec5; WORD $0x2039 // vmovdqu ymm1, yword [rcx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x4039 // vmovdqu ymm2, yword [rcx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6039 // vmovdqu ymm3, yword [rcx + rdi + 96]
+ LONG $0x04fcfdc5; BYTE $0x3a // vpaddb ymm0, ymm0, yword [rdx + rdi]
+ LONG $0x4cfcf5c5; WORD $0x203a // vpaddb ymm1, ymm1, yword [rdx + rdi + 32]
+ LONG $0x54fcedc5; WORD $0x403a // vpaddb ymm2, ymm2, yword [rdx + rdi + 64]
+ LONG $0x5cfce5c5; WORD $0x603a // vpaddb ymm3, ymm3, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_50
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_52:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_54
+
+LBB0_53:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_53
+
+LBB0_54:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_55:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_55
+ JMP LBB0_537
+
+LBB0_180:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_185
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_185
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ WORD $0xff31 // xor edi, edi
+
+LBB0_183:
+ LONG $0x046ffec5; BYTE $0x39 // vmovdqu ymm0, yword [rcx + rdi]
+ LONG $0x4c6ffec5; WORD $0x2039 // vmovdqu ymm1, yword [rcx + rdi + 32]
+ LONG $0x546ffec5; WORD $0x4039 // vmovdqu ymm2, yword [rcx + rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x6039 // vmovdqu ymm3, yword [rcx + rdi + 96]
+ LONG $0x04fcfdc5; BYTE $0x3a // vpaddb ymm0, ymm0, yword [rdx + rdi]
+ LONG $0x4cfcf5c5; WORD $0x203a // vpaddb ymm1, ymm1, yword [rdx + rdi + 32]
+ LONG $0x54fcedc5; WORD $0x403a // vpaddb ymm2, ymm2, yword [rdx + rdi + 64]
+ LONG $0x5cfce5c5; WORD $0x603a // vpaddb ymm3, ymm3, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x3804 // vmovdqu yword [r8 + rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0x384c; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm3
+ LONG $0x80ef8348 // sub rdi, -128
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_183
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_185:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_187
+
+LBB0_186:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_186
+
+LBB0_187:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_188:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_188
+ JMP LBB0_537
+
+LBB0_360:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_365
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_365
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_363:
+ LONG $0x046ffec5; BYTE $0xba // vmovdqu ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20ba // vmovdqu ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40ba // vmovdqu ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60ba // vmovdqu ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x04fafdc5; BYTE $0xb9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi]
+ LONG $0x4cfaf5c5; WORD $0x20b9 // vpsubd ymm1, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x54faedc5; WORD $0x40b9 // vpsubd ymm2, ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5cfae5c5; WORD $0x60b9 // vpsubd ymm3, ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_363
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_365:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_367
+
+LBB0_366:
+ WORD $0x3c8b; BYTE $0xb2 // mov edi, dword [rdx + 4*rsi]
+ WORD $0x3c2b; BYTE $0xb1 // sub edi, dword [rcx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_366
+
+LBB0_367:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_368:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_368
+ JMP LBB0_537
+
+LBB0_486:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_491
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_491
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_489:
+ LONG $0x046ffec5; BYTE $0xba // vmovdqu ymm0, yword [rdx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20ba // vmovdqu ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40ba // vmovdqu ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60ba // vmovdqu ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x04fafdc5; BYTE $0xb9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi]
+ LONG $0x4cfaf5c5; WORD $0x20b9 // vpsubd ymm1, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x54faedc5; WORD $0x40b9 // vpsubd ymm2, ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5cfae5c5; WORD $0x60b9 // vpsubd ymm3, ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_489
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_491:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_493
+
+LBB0_492:
+ WORD $0x3c8b; BYTE $0xb2 // mov edi, dword [rdx + 4*rsi]
+ WORD $0x3c2b; BYTE $0xb1 // sub edi, dword [rcx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_492
+
+LBB0_493:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_494:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_494
+ JMP LBB0_537
+
+LBB0_101:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_106
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_106
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_104:
+ LONG $0x046ffec5; BYTE $0xb9 // vmovdqu ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20b9 // vmovdqu ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40b9 // vmovdqu ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60b9 // vmovdqu ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x04fefdc5; BYTE $0xba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi]
+ LONG $0x4cfef5c5; WORD $0x20ba // vpaddd ymm1, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x54feedc5; WORD $0x40ba // vpaddd ymm2, ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5cfee5c5; WORD $0x60ba // vpaddd ymm3, ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_104
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_106:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_108
+
+LBB0_107:
+ WORD $0x3c8b; BYTE $0xb1 // mov edi, dword [rcx + 4*rsi]
+ WORD $0x3c03; BYTE $0xb2 // add edi, dword [rdx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_107
+
+LBB0_108:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_109:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_109
+ JMP LBB0_537
+
+LBB0_234:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_239
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_239
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xff31 // xor edi, edi
+
+LBB0_237:
+ LONG $0x046ffec5; BYTE $0xb9 // vmovdqu ymm0, yword [rcx + 4*rdi]
+ LONG $0x4c6ffec5; WORD $0x20b9 // vmovdqu ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x546ffec5; WORD $0x40b9 // vmovdqu ymm2, yword [rcx + 4*rdi + 64]
+ LONG $0x5c6ffec5; WORD $0x60b9 // vmovdqu ymm3, yword [rcx + 4*rdi + 96]
+ LONG $0x04fefdc5; BYTE $0xba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi]
+ LONG $0x4cfef5c5; WORD $0x20ba // vpaddd ymm1, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x54feedc5; WORD $0x40ba // vpaddd ymm2, ymm2, yword [rdx + 4*rdi + 64]
+ LONG $0x5cfee5c5; WORD $0x60ba // vpaddd ymm3, ymm3, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb804 // vmovdqu yword [r8 + 4*rdi], ymm0
+ LONG $0x7f7ec1c4; WORD $0xb84c; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm3
+ LONG $0x20c78348 // add rdi, 32
+ WORD $0x3948; BYTE $0xfe // cmp rsi, rdi
+ JNE LBB0_237
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB0_537
+
+LBB0_239:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd0 // mov rax, r10
+ LONG $0x03e08348 // and rax, 3
+ JE LBB0_241
+
+LBB0_240:
+ WORD $0x3c8b; BYTE $0xb1 // mov edi, dword [rcx + 4*rsi]
+ WORD $0x3c03; BYTE $0xb2 // add edi, dword [rdx + 4*rsi]
+ LONG $0xb03c8941 // mov dword [r8 + 4*rsi], edi
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc08348 // add rax, -1
+ JNE LBB0_240
+
+LBB0_241:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_537
+
+LBB0_242:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_242
+
+LBB0_537:
+ VZEROUPPER
+ RET
+
+TEXT ·_arithmetic_arr_scalar_avx2(SB), $0-48
+
+ MOVQ typ+0(FP), DI
+ MOVQ op+8(FP), SI
+ MOVQ inLeft+16(FP), DX
+ MOVQ inRight+24(FP), CX
+ MOVQ out+32(FP), R8
+ MOVQ len+40(FP), R9
+
+ LONG $0x01fe8040 // cmp sil, 1
+ JG LBB1_11
+ WORD $0x8440; BYTE $0xf6 // test sil, sil
+ JE LBB1_21
+ LONG $0x01fe8040 // cmp sil, 1
+ JNE LBB1_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_37
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_65
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_105
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_108
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_10
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_297
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_297
+
+LBB1_10:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_421:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_423
+
+LBB1_422:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_422
+
+LBB1_423:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_424:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_424
+ JMP LBB1_737
+
+LBB1_11:
+ LONG $0x02fe8040 // cmp sil, 2
+ JE LBB1_29
+ LONG $0x03fe8040 // cmp sil, 3
+ JNE LBB1_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_44
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_70
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_111
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_114
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_20
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_300
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_300
+
+LBB1_20:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_429:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_431
+
+LBB1_430:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_430
+
+LBB1_431:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_432:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_432
+ JMP LBB1_737
+
+LBB1_21:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_51
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_75
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_117
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_120
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_28
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_303
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_303
+
+LBB1_28:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_437:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_439
+
+LBB1_438:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_438
+
+LBB1_439:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_440:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_440
+ JMP LBB1_737
+
+LBB1_29:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_58
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_80
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_123
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_126
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_36
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_306
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_306
+
+LBB1_36:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_445:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_447
+
+LBB1_446:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_446
+
+LBB1_447:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_448:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_448
+ JMP LBB1_737
+
+LBB1_37:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_85
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_129
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_132
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x0110fbc5 // vmovsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_43
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_309
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_309
+
+LBB1_43:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_453:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_455
+
+LBB1_454:
+ LONG $0x0c10fbc5; BYTE $0xca // vmovsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_454
+
+LBB1_455:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_456:
+ LONG $0x0c10fbc5; BYTE $0xca // vmovsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c10fbc5; WORD $0x08ca // vmovsd xmm1, qword [rdx + 8*rcx + 8]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x08 // vmovsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c10fbc5; WORD $0x10ca // vmovsd xmm1, qword [rdx + 8*rcx + 16]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x10 // vmovsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c10fbc5; WORD $0x18ca // vmovsd xmm1, qword [rdx + 8*rcx + 24]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x18 // vmovsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_456
+ JMP LBB1_737
+
+LBB1_44:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_90
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_135
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_138
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x0110fbc5 // vmovsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_50
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_312
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_312
+
+LBB1_50:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_461:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_463
+
+LBB1_462:
+ LONG $0x0c10fbc5; BYTE $0xca // vmovsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_462
+
+LBB1_463:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_464:
+ LONG $0x0c10fbc5; BYTE $0xca // vmovsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c10fbc5; WORD $0x08ca // vmovsd xmm1, qword [rdx + 8*rcx + 8]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x08 // vmovsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c10fbc5; WORD $0x10ca // vmovsd xmm1, qword [rdx + 8*rcx + 16]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x10 // vmovsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c10fbc5; WORD $0x18ca // vmovsd xmm1, qword [rdx + 8*rcx + 24]
+ LONG $0xc85cf3c5 // vsubsd xmm1, xmm1, xmm0
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x18 // vmovsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_464
+ JMP LBB1_737
+
+LBB1_51:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_95
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_141
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_144
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x0110fbc5 // vmovsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_57
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_315
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_315
+
+LBB1_57:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_469:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_471
+
+LBB1_470:
+ LONG $0x0c58fbc5; BYTE $0xca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx]
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_470
+
+LBB1_471:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_472:
+ LONG $0x0c58fbc5; BYTE $0xca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx]
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c58fbc5; WORD $0x08ca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx + 8]
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x08 // vmovsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c58fbc5; WORD $0x10ca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx + 16]
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x10 // vmovsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c58fbc5; WORD $0x18ca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx + 24]
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x18 // vmovsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_472
+ JMP LBB1_737
+
+LBB1_58:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_100
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_147
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_150
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x0110fbc5 // vmovsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_64
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_318
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_318
+
+LBB1_64:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_477:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_479
+
+LBB1_478:
+ LONG $0x0c58fbc5; BYTE $0xca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx]
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_478
+
+LBB1_479:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_480:
+ LONG $0x0c58fbc5; BYTE $0xca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx]
+ LONG $0x117bc1c4; WORD $0xc80c // vmovsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c58fbc5; WORD $0x08ca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx + 8]
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x08 // vmovsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c58fbc5; WORD $0x10ca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx + 16]
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x10 // vmovsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c58fbc5; WORD $0x18ca // vaddsd xmm1, xmm0, qword [rdx + 8*rcx + 24]
+ LONG $0x117bc1c4; WORD $0xc84c; BYTE $0x18 // vmovsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_480
+ JMP LBB1_737
+
+LBB1_65:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_153
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_69
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_321
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_321
+
+LBB1_69:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_485:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_487
+
+LBB1_486:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_486
+
+LBB1_487:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_488:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_488
+ JMP LBB1_737
+
+LBB1_70:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_156
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_74
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_324
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_324
+
+LBB1_74:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_493:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_495
+
+LBB1_494:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_494
+
+LBB1_495:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_496:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_496
+ JMP LBB1_737
+
+LBB1_75:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_159
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_79
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_327
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_327
+
+LBB1_79:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_501:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_503
+
+LBB1_502:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_502
+
+LBB1_503:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_504:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_504
+ JMP LBB1_737
+
+LBB1_80:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_162
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_84
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_330
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_330
+
+LBB1_84:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_509:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_511
+
+LBB1_510:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_510
+
+LBB1_511:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_512:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_512
+ JMP LBB1_737
+
+LBB1_85:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_165
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_89
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_333
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_333
+
+LBB1_89:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_517:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_519
+
+LBB1_518:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_518
+
+LBB1_519:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_520:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_520
+ JMP LBB1_737
+
+LBB1_90:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_168
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_94
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_336
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_336
+
+LBB1_94:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_525:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_527
+
+LBB1_526:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_526
+
+LBB1_527:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_528:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_528
+ JMP LBB1_737
+
+LBB1_95:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_171
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_99
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_339
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_339
+
+LBB1_99:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_533:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_535
+
+LBB1_534:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_534
+
+LBB1_535:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_536:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_536
+ JMP LBB1_737
+
+LBB1_100:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_174
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_104
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_342
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_342
+
+LBB1_104:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_541:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_543
+
+LBB1_542:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_542
+
+LBB1_543:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_544:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_544
+ JMP LBB1_737
+
+LBB1_105:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_107
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_345
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_345
+
+LBB1_107:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_549:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_551
+
+LBB1_550:
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0xc729 // sub edi, eax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_550
+
+LBB1_551:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_552:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_552
+ JMP LBB1_737
+
+LBB1_108:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_110
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_348
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_348
+
+LBB1_110:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_557:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_559
+
+LBB1_558:
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0xc729 // sub edi, eax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_558
+
+LBB1_559:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_560:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_560
+ JMP LBB1_737
+
+LBB1_111:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_113
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_351
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_351
+
+LBB1_113:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_565:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_567
+
+LBB1_566:
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0xc729 // sub edi, eax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_566
+
+LBB1_567:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_568:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_568
+ JMP LBB1_737
+
+LBB1_114:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_116
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_354
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_354
+
+LBB1_116:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_573:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_575
+
+LBB1_574:
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0xc729 // sub edi, eax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_574
+
+LBB1_575:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_576:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_576
+ JMP LBB1_737
+
+LBB1_117:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_119
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_357
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_357
+
+LBB1_119:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_581:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_583
+
+LBB1_582:
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_582
+
+LBB1_583:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_584:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_584
+ JMP LBB1_737
+
+LBB1_120: // Case: out[i] = in[i] + scalar on 16-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_122 // fewer than 32 elements: use the scalar loop
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_360 // in ends at or before out starts: no overlap (LBB1_360 is outside this view; presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_360 // out ends at or before in starts: no overlap
+
+LBB1_122: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_589: // tail setup: r9 = count - rsi - 1, rcx = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_591 // count is a multiple of 4: no peel iterations needed
+
+LBB1_590: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_590 // repeat until the peel counter reaches zero
+
+LBB1_591: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_589: skip the unrolled loop
+
+LBB1_592: // main loop, unrolled 4x; rcx is reused as the element temporary
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_592 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_123: // Case: out[i] = in[i] + scalar on 16-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_125 // fewer than 32 elements: use the scalar loop
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_363 // in ends at or before out starts: no overlap (LBB1_363 is outside this view; presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_363 // out ends at or before in starts: no overlap
+
+LBB1_125: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_597: // tail setup: r9 = count - rsi - 1, rcx = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_599 // count is a multiple of 4: no peel iterations needed
+
+LBB1_598: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_598 // repeat until the peel counter reaches zero
+
+LBB1_599: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_597: skip the unrolled loop
+
+LBB1_600: // main loop, unrolled 4x; rcx is reused as the element temporary
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_600 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_126: // Case: out[i] = in[i] + scalar on 16-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_128 // fewer than 32 elements: use the scalar loop
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_366 // in ends at or before out starts: no overlap (LBB1_366 is outside this view; presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_366 // out ends at or before in starts: no overlap
+
+LBB1_128: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_605: // tail setup: r9 = count - rsi - 1, rcx = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd1 // mov rcx, r10
+ LONG $0x03e18348 // and rcx, 3
+ JE LBB1_607 // count is a multiple of 4: no peel iterations needed
+
+LBB1_606: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x723cb70f // movzx edi, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc18348 // add rcx, -1
+ JNE LBB1_606 // repeat until the peel counter reaches zero
+
+LBB1_607: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_605: skip the unrolled loop
+
+LBB1_608: // main loop, unrolled 4x; rcx is reused as the element temporary
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_608 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_129: // Case: out[i] = in[i] - scalar on 64-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_131 // fewer than 16 elements: use the scalar loop
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_369 // in ends at or before out starts: no overlap (LBB1_369 is outside this view; presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_369 // out ends at or before in starts: no overlap
+
+LBB1_131: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_613: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_615 // count is a multiple of 4: no peel iterations needed
+
+LBB1_614: // peel loop: one element per iteration, (count & 3) times
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_614 // repeat until the peel counter reaches zero
+
+LBB1_615: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_613: skip the unrolled loop
+
+LBB1_616: // main loop, unrolled 4x
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_616 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_132: // Case: out[i] = in[i] - scalar on single-precision floats (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ LONG $0x0110fac5 // vmovss xmm0, dword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_134 // fewer than 32 elements: use the scalar loop
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_372 // in ends at or before out starts: no overlap (LBB1_372 is outside this view; presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_372 // out ends at or before in starts: no overlap
+
+LBB1_134: // short count or overlapping buffers: scalar loop starts at index rcx = 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_621: // tail setup: rsi = count - rcx - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rcx != 0 - confirm
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_623 // count is a multiple of 4: no peel iterations needed
+
+LBB1_622: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x0c10fac5; BYTE $0x8a // vmovss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_622 // repeat until the peel counter reaches zero
+
+LBB1_623: // decide whether the 4x unrolled loop has any work
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_621: skip the unrolled loop
+
+LBB1_624: // main loop, unrolled 4x
+ LONG $0x0c10fac5; BYTE $0x8a // vmovss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c10fac5; WORD $0x048a // vmovss xmm1, dword [rdx + 4*rcx + 4]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x04 // vmovss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c10fac5; WORD $0x088a // vmovss xmm1, dword [rdx + 4*rcx + 8]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x08 // vmovss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c10fac5; WORD $0x0c8a // vmovss xmm1, dword [rdx + 4*rcx + 12]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x0c // vmovss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_624 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_135: // Case: out[i] = in[i] - scalar on 64-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_137 // fewer than 16 elements: use the scalar loop
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_375 // in ends at or before out starts: no overlap (LBB1_375 is outside this view; presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_375 // out ends at or before in starts: no overlap
+
+LBB1_137: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_629: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_631 // count is a multiple of 4: no peel iterations needed
+
+LBB1_630: // peel loop: one element per iteration, (count & 3) times
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_630 // repeat until the peel counter reaches zero
+
+LBB1_631: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_629: skip the unrolled loop
+
+LBB1_632: // main loop, unrolled 4x
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_632 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_138: // Case: out[i] = in[i] - scalar on single-precision floats (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ LONG $0x0110fac5 // vmovss xmm0, dword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_140 // fewer than 32 elements: use the scalar loop
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_378 // in ends at or before out starts: no overlap (LBB1_378 is outside this view; presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_378 // out ends at or before in starts: no overlap
+
+LBB1_140: // short count or overlapping buffers: scalar loop starts at index rcx = 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_637: // tail setup: rsi = count - rcx - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rcx != 0 - confirm
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_639 // count is a multiple of 4: no peel iterations needed
+
+LBB1_638: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x0c10fac5; BYTE $0x8a // vmovss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_638 // repeat until the peel counter reaches zero
+
+LBB1_639: // decide whether the 4x unrolled loop has any work
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_637: skip the unrolled loop
+
+LBB1_640: // main loop, unrolled 4x
+ LONG $0x0c10fac5; BYTE $0x8a // vmovss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c10fac5; WORD $0x048a // vmovss xmm1, dword [rdx + 4*rcx + 4]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x04 // vmovss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c10fac5; WORD $0x088a // vmovss xmm1, dword [rdx + 4*rcx + 8]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x08 // vmovss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c10fac5; WORD $0x0c8a // vmovss xmm1, dword [rdx + 4*rcx + 12]
+ LONG $0xc85cf2c5 // vsubss xmm1, xmm1, xmm0
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x0c // vmovss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_640 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_141: // Case: out[i] = in[i] + scalar on 64-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_143 // fewer than 16 elements: use the scalar loop
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_381 // in ends at or before out starts: no overlap (LBB1_381 is outside this view; presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_381 // out ends at or before in starts: no overlap
+
+LBB1_143: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_645: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_647 // count is a multiple of 4: no peel iterations needed
+
+LBB1_646: // peel loop: one element per iteration, (count & 3) times
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_646 // repeat until the peel counter reaches zero
+
+LBB1_647: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_645: skip the unrolled loop
+
+LBB1_648: // main loop, unrolled 4x
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_648 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_144: // Case: out[i] = scalar + in[i] on single-precision floats (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ LONG $0x0110fac5 // vmovss xmm0, dword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_146 // fewer than 32 elements: use the scalar loop
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_384 // in ends at or before out starts: no overlap (LBB1_384 is outside this view; presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_384 // out ends at or before in starts: no overlap
+
+LBB1_146: // short count or overlapping buffers: scalar loop starts at index rcx = 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_653: // tail setup: rsi = count - rcx - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rcx != 0 - confirm
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_655 // count is a multiple of 4: no peel iterations needed
+
+LBB1_654: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x0c58fac5; BYTE $0x8a // vaddss xmm1, xmm0, dword [rdx + 4*rcx]
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_654 // repeat until the peel counter reaches zero
+
+LBB1_655: // decide whether the 4x unrolled loop has any work
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_653: skip the unrolled loop
+
+LBB1_656: // main loop, unrolled 4x
+ LONG $0x0c58fac5; BYTE $0x8a // vaddss xmm1, xmm0, dword [rdx + 4*rcx]
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c58fac5; WORD $0x048a // vaddss xmm1, xmm0, dword [rdx + 4*rcx + 4]
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x04 // vmovss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c58fac5; WORD $0x088a // vaddss xmm1, xmm0, dword [rdx + 4*rcx + 8]
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x08 // vmovss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c58fac5; WORD $0x0c8a // vaddss xmm1, xmm0, dword [rdx + 4*rcx + 12]
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x0c // vmovss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_656 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_147: // Case: out[i] = in[i] + scalar on 64-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_149 // fewer than 16 elements: use the scalar loop
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_387 // in ends at or before out starts: no overlap (LBB1_387 is outside this view; presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_387 // out ends at or before in starts: no overlap
+
+LBB1_149: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_661: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_663 // count is a multiple of 4: no peel iterations needed
+
+LBB1_662: // peel loop: one element per iteration, (count & 3) times
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_662 // repeat until the peel counter reaches zero
+
+LBB1_663: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_661: skip the unrolled loop
+
+LBB1_664: // main loop, unrolled 4x
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_664 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_150: // Case: out[i] = scalar + in[i] on single-precision floats (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ LONG $0x0110fac5 // vmovss xmm0, dword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_152 // fewer than 32 elements: use the scalar loop
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_390 // in ends at or before out starts: no overlap (LBB1_390 is outside this view; presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_390 // out ends at or before in starts: no overlap
+
+LBB1_152: // short count or overlapping buffers: scalar loop starts at index rcx = 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_669: // tail setup: rsi = count - rcx - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rcx != 0 - confirm
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_671 // count is a multiple of 4: no peel iterations needed
+
+LBB1_670: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x0c58fac5; BYTE $0x8a // vaddss xmm1, xmm0, dword [rdx + 4*rcx]
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_670 // repeat until the peel counter reaches zero
+
+LBB1_671: // decide whether the 4x unrolled loop has any work
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_669: skip the unrolled loop
+
+LBB1_672: // main loop, unrolled 4x
+ LONG $0x0c58fac5; BYTE $0x8a // vaddss xmm1, xmm0, dword [rdx + 4*rcx]
+ LONG $0x117ac1c4; WORD $0x880c // vmovss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c58fac5; WORD $0x048a // vaddss xmm1, xmm0, dword [rdx + 4*rcx + 4]
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x04 // vmovss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c58fac5; WORD $0x088a // vaddss xmm1, xmm0, dword [rdx + 4*rcx + 8]
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x08 // vmovss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c58fac5; WORD $0x0c8a // vaddss xmm1, xmm0, dword [rdx + 4*rcx + 12]
+ LONG $0x117ac1c4; WORD $0x884c; BYTE $0x0c // vmovss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_672 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_153: // Case: out[i] = in[i] - scalar on 8-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_155 // fewer than 128 elements: use the scalar loop
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_393 // in ends at or before out starts: no overlap (LBB1_393 is outside this view; presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_393 // out ends at or before in starts: no overlap
+
+LBB1_155: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_677: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_679 // count is a multiple of 4: no peel iterations needed
+
+LBB1_678: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_678 // repeat until the peel counter reaches zero
+
+LBB1_679: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_677: skip the unrolled loop
+
+LBB1_680: // main loop, unrolled 4x
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_680 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_156: // Case: out[i] = in[i] - scalar on 8-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_158 // fewer than 128 elements: use the scalar loop
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_396 // in ends at or before out starts: no overlap (LBB1_396 is outside this view; presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_396 // out ends at or before in starts: no overlap
+
+LBB1_158: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_685: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_687 // count is a multiple of 4: no peel iterations needed
+
+LBB1_686: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_686 // repeat until the peel counter reaches zero
+
+LBB1_687: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_685: skip the unrolled loop
+
+LBB1_688: // main loop, unrolled 4x
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_688 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_159: // Case: out[i] = in[i] + scalar on 8-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_161 // fewer than 128 elements: use the scalar loop
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_399 // in ends at or before out starts: no overlap (LBB1_399 is outside this view; presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_399 // out ends at or before in starts: no overlap
+
+LBB1_161: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_693: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_695 // count is a multiple of 4: no peel iterations needed
+
+LBB1_694: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_694 // repeat until the peel counter reaches zero
+
+LBB1_695: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_693: skip the unrolled loop
+
+LBB1_696: // main loop, unrolled 4x
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_696 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_162: // Case: out[i] = in[i] + scalar on 8-bit elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB1_164 // fewer than 128 elements: use the scalar loop
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_402 // in ends at or before out starts: no overlap (LBB1_402 is outside this view; presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_402 // out ends at or before in starts: no overlap
+
+LBB1_164: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_701: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_703 // count is a multiple of 4: no peel iterations needed
+
+LBB1_702: // peel loop: one element per iteration, (count & 3) times
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_702 // repeat until the peel counter reaches zero
+
+LBB1_703: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_701: skip the unrolled loop
+
+LBB1_704: // main loop, unrolled 4x
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_704 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_165: // Case: out[i] = in[i] - scalar on 32-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_167 // fewer than 32 elements: use the scalar loop
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_405 // in ends at or before out starts: no overlap (LBB1_405 is outside this view; presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_405 // out ends at or before in starts: no overlap
+
+LBB1_167: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_709: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_711 // count is a multiple of 4: no peel iterations needed
+
+LBB1_710: // peel loop: one element per iteration, (count & 3) times
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_710 // repeat until the peel counter reaches zero
+
+LBB1_711: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_709: skip the unrolled loop
+
+LBB1_712: // main loop, unrolled 4x
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_712 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_168: // Case: out[i] = in[i] - scalar on 32-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_170 // fewer than 32 elements: use the scalar loop
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_408 // in ends at or before out starts: no overlap (LBB1_408 is outside this view; presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_408 // out ends at or before in starts: no overlap
+
+LBB1_170: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_717: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_719 // count is a multiple of 4: no peel iterations needed
+
+LBB1_718: // peel loop: one element per iteration, (count & 3) times
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_718 // repeat until the peel counter reaches zero
+
+LBB1_719: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_717: skip the unrolled loop
+
+LBB1_720: // main loop, unrolled 4x
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_720 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_171: // Case: out[i] = in[i] + scalar on 32-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_173 // fewer than 32 elements: use the scalar loop
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_411 // in ends at or before out starts: no overlap (LBB1_411 is outside this view; presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_411 // out ends at or before in starts: no overlap
+
+LBB1_173: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_725: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_727 // count is a multiple of 4: no peel iterations needed
+
+LBB1_726: // peel loop: one element per iteration, (count & 3) times
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_726 // repeat until the peel counter reaches zero
+
+LBB1_727: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_725: skip the unrolled loop
+
+LBB1_728: // main loop, unrolled 4x
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_728 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_174: // Case: out[i] = in[i] + scalar on 32-bit integer elements (rdx = in, r8 = out, rcx -> scalar, r9d = count).
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0: nothing to do (LBB1_737 is outside this view; presumably the shared exit)
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_176 // fewer than 32 elements: use the scalar loop
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_414 // in ends at or before out starts: no overlap (LBB1_414 is outside this view; presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_414 // out ends at or before in starts: no overlap
+
+LBB1_176: // short count or overlapping buffers: scalar loop starts at index rsi = 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_733: // tail setup: r9 = count - rsi - 1, rdi = count & 3 (peel count). NOTE(review): presumably also entered from outside this view with rsi != 0 - confirm
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_735 // count is a multiple of 4: no peel iterations needed
+
+LBB1_734: // peel loop: one element per iteration, (count & 3) times
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_734 // repeat until the peel counter reaches zero
+
+LBB1_735: // decide whether the 4x unrolled loop has any work
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737 // fewer than 4 elements were pending at LBB1_733: skip the unrolled loop
+
+LBB1_736: // main loop, unrolled 4x
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_736 // loop until index == count
+ JMP LBB1_737 // case finished
+
+LBB1_297: // 256-bit vector setup, 32-bit subtract-scalar: destination = source - eax, 32 dwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 ; ymm0 = eax in all 8 dword lanes
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 32-dword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_417 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_299: // main loop: two chunks (64 dwords) per pass, [r8 + 4*rdi ...] = [rdx + 4*rdi ...] - ymm0
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 4*rdi + 224]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_299
+ JMP LBB1_418
+
+LBB1_300: // 256-bit vector setup, 32-bit subtract-scalar: destination = source - eax, 32 dwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 ; ymm0 = eax in all 8 dword lanes
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 32-dword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_425 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_302: // main loop: two chunks (64 dwords) per pass, [r8 + 4*rdi ...] = [rdx + 4*rdi ...] - ymm0
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 4*rdi + 224]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_302
+ JMP LBB1_426
+
+LBB1_303: // 256-bit vector setup, 32-bit add-scalar: destination = eax + source, 32 dwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 ; ymm0 = eax in all 8 dword lanes
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 32-dword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_433 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_305: // main loop: two chunks (64 dwords) per pass, [r8 + 4*rdi ...] = ymm0 + [rdx + 4*rdi ...]
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60ba // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba94fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_305
+ JMP LBB1_434
+
+LBB1_306: // 256-bit vector setup, 32-bit add-scalar: destination = eax + source, 32 dwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 ; ymm0 = eax in all 8 dword lanes
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 32-dword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_441 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_308: // main loop: two chunks (64 dwords) per pass, [r8 + 4*rdi ...] = ymm0 + [rdx + 4*rdi ...]
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60ba // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba94fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_308
+ JMP LBB1_442
+
+LBB1_309: // 256-bit vector setup, float64 subtract-scalar: destination = source - xmm0, 16 doubles per chunk
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf0 // and ecx, -16 ; ecx = eax rounded down to a multiple of 16
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; ymm1 = low double of xmm0 in all 4 lanes
+ LONG $0xf0718d48 // lea rsi, [rcx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-double chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_449 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xcf // mov rdi, r9
+ LONG $0xfee78348 // and rdi, -2
+ WORD $0xf748; BYTE $0xdf // neg rdi ; rdi = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xf631 // xor esi, esi ; rsi = element index = 0
+
+LBB1_311: // main loop: two chunks (32 doubles) per pass, [r8 + 8*rsi ...] = [rdx + 8*rsi ...] - ymm1
+ LONG $0x1410fdc5; BYTE $0xf2 // vmovupd ymm2, yword [rdx + 8*rsi]
+ LONG $0x5c10fdc5; WORD $0x20f2 // vmovupd ymm3, yword [rdx + 8*rsi + 32]
+ LONG $0x6410fdc5; WORD $0x40f2 // vmovupd ymm4, yword [rdx + 8*rsi + 64]
+ LONG $0x6c10fdc5; WORD $0x60f2 // vmovupd ymm5, yword [rdx + 8*rsi + 96]
+ LONG $0xd15cedc5 // vsubpd ymm2, ymm2, ymm1
+ LONG $0xd95ce5c5 // vsubpd ymm3, ymm3, ymm1
+ LONG $0xe15cddc5 // vsubpd ymm4, ymm4, ymm1
+ LONG $0xe95cd5c5 // vsubpd ymm5, ymm5, ymm1
+ LONG $0x117dc1c4; WORD $0xf014 // vmovupd yword [r8 + 8*rsi], ymm2
+ LONG $0x117dc1c4; WORD $0xf05c; BYTE $0x20 // vmovupd yword [r8 + 8*rsi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf064; BYTE $0x40 // vmovupd yword [r8 + 8*rsi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf06c; BYTE $0x60 // vmovupd yword [r8 + 8*rsi + 96], ymm5
+ QUAD $0x000080f29410fdc5; BYTE $0x00 // vmovupd ymm2, yword [rdx + 8*rsi + 128]
+ QUAD $0x0000a0f29c10fdc5; BYTE $0x00 // vmovupd ymm3, yword [rdx + 8*rsi + 160]
+ QUAD $0x0000c0f2a410fdc5; BYTE $0x00 // vmovupd ymm4, yword [rdx + 8*rsi + 192]
+ QUAD $0x0000e0f2ac10fdc5; BYTE $0x00 // vmovupd ymm5, yword [rdx + 8*rsi + 224]
+ LONG $0xd15cedc5 // vsubpd ymm2, ymm2, ymm1
+ LONG $0xd95ce5c5 // vsubpd ymm3, ymm3, ymm1
+ LONG $0xe15cddc5 // vsubpd ymm4, ymm4, ymm1
+ LONG $0xe95cd5c5 // vsubpd ymm5, ymm5, ymm1
+ QUAD $0x0080f094117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 128], ymm2
+ QUAD $0x00a0f09c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 160], ymm3
+ QUAD $0x00c0f0a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 192], ymm4
+ QUAD $0x00e0f0ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 224], ymm5
+ LONG $0x20c68348 // add rsi, 32
+ LONG $0x02c78348 // add rdi, 2
+ JNE LBB1_311
+ JMP LBB1_450
+
+LBB1_312: // 256-bit vector setup, float64 subtract-scalar: destination = source - xmm0, 16 doubles per chunk
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf0 // and ecx, -16 ; ecx = eax rounded down to a multiple of 16
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; ymm1 = low double of xmm0 in all 4 lanes
+ LONG $0xf0718d48 // lea rsi, [rcx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-double chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_457 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xcf // mov rdi, r9
+ LONG $0xfee78348 // and rdi, -2
+ WORD $0xf748; BYTE $0xdf // neg rdi ; rdi = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xf631 // xor esi, esi ; rsi = element index = 0
+
+LBB1_314: // main loop: two chunks (32 doubles) per pass, [r8 + 8*rsi ...] = [rdx + 8*rsi ...] - ymm1
+ LONG $0x1410fdc5; BYTE $0xf2 // vmovupd ymm2, yword [rdx + 8*rsi]
+ LONG $0x5c10fdc5; WORD $0x20f2 // vmovupd ymm3, yword [rdx + 8*rsi + 32]
+ LONG $0x6410fdc5; WORD $0x40f2 // vmovupd ymm4, yword [rdx + 8*rsi + 64]
+ LONG $0x6c10fdc5; WORD $0x60f2 // vmovupd ymm5, yword [rdx + 8*rsi + 96]
+ LONG $0xd15cedc5 // vsubpd ymm2, ymm2, ymm1
+ LONG $0xd95ce5c5 // vsubpd ymm3, ymm3, ymm1
+ LONG $0xe15cddc5 // vsubpd ymm4, ymm4, ymm1
+ LONG $0xe95cd5c5 // vsubpd ymm5, ymm5, ymm1
+ LONG $0x117dc1c4; WORD $0xf014 // vmovupd yword [r8 + 8*rsi], ymm2
+ LONG $0x117dc1c4; WORD $0xf05c; BYTE $0x20 // vmovupd yword [r8 + 8*rsi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf064; BYTE $0x40 // vmovupd yword [r8 + 8*rsi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf06c; BYTE $0x60 // vmovupd yword [r8 + 8*rsi + 96], ymm5
+ QUAD $0x000080f29410fdc5; BYTE $0x00 // vmovupd ymm2, yword [rdx + 8*rsi + 128]
+ QUAD $0x0000a0f29c10fdc5; BYTE $0x00 // vmovupd ymm3, yword [rdx + 8*rsi + 160]
+ QUAD $0x0000c0f2a410fdc5; BYTE $0x00 // vmovupd ymm4, yword [rdx + 8*rsi + 192]
+ QUAD $0x0000e0f2ac10fdc5; BYTE $0x00 // vmovupd ymm5, yword [rdx + 8*rsi + 224]
+ LONG $0xd15cedc5 // vsubpd ymm2, ymm2, ymm1
+ LONG $0xd95ce5c5 // vsubpd ymm3, ymm3, ymm1
+ LONG $0xe15cddc5 // vsubpd ymm4, ymm4, ymm1
+ LONG $0xe95cd5c5 // vsubpd ymm5, ymm5, ymm1
+ QUAD $0x0080f094117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 128], ymm2
+ QUAD $0x00a0f09c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 160], ymm3
+ QUAD $0x00c0f0a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 192], ymm4
+ QUAD $0x00e0f0ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rsi + 224], ymm5
+ LONG $0x20c68348 // add rsi, 32
+ LONG $0x02c78348 // add rdi, 2
+ JNE LBB1_314
+ JMP LBB1_458
+
+LBB1_315: // 256-bit vector setup, float64 add-scalar: destination = xmm0 + source, 16 doubles per chunk
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf0 // and ecx, -16 ; ecx = eax rounded down to a multiple of 16
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; ymm1 = low double of xmm0 in all 4 lanes
+ LONG $0xf0718d48 // lea rsi, [rcx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-double chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_465 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_317: // main loop: two chunks (32 doubles) per pass, [r8 + 8*rdi ...] = ymm1 + [rdx + 8*rdi ...]
+ LONG $0x1458f5c5; BYTE $0xfa // vaddpd ymm2, ymm1, yword [rdx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20fa // vaddpd ymm3, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40fa // vaddpd ymm4, ymm1, yword [rdx + 8*rdi + 64]
+ LONG $0x6c58f5c5; WORD $0x60fa // vaddpd ymm5, ymm1, yword [rdx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf86c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm5
+ QUAD $0x000080fa9458f5c5; BYTE $0x00 // vaddpd ymm2, ymm1, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa9c58f5c5; BYTE $0x00 // vaddpd ymm3, ymm1, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0faa458f5c5; BYTE $0x00 // vaddpd ymm4, ymm1, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faac58f5c5; BYTE $0x00 // vaddpd ymm5, ymm1, yword [rdx + 8*rdi + 224]
+ QUAD $0x0080f894117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 128], ymm2
+ QUAD $0x00a0f89c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 160], ymm3
+ QUAD $0x00c0f8a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 192], ymm4
+ QUAD $0x00e0f8ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 224], ymm5
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_317
+ JMP LBB1_466
+
+LBB1_318: // 256-bit vector setup, float64 add-scalar: destination = xmm0 + source, 16 doubles per chunk
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf0 // and ecx, -16 ; ecx = eax rounded down to a multiple of 16
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; ymm1 = low double of xmm0 in all 4 lanes
+ LONG $0xf0718d48 // lea rsi, [rcx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-double chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_473 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_320: // main loop: two chunks (32 doubles) per pass, [r8 + 8*rdi ...] = ymm1 + [rdx + 8*rdi ...]
+ LONG $0x1458f5c5; BYTE $0xfa // vaddpd ymm2, ymm1, yword [rdx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20fa // vaddpd ymm3, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40fa // vaddpd ymm4, ymm1, yword [rdx + 8*rdi + 64]
+ LONG $0x6c58f5c5; WORD $0x60fa // vaddpd ymm5, ymm1, yword [rdx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf86c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm5
+ QUAD $0x000080fa9458f5c5; BYTE $0x00 // vaddpd ymm2, ymm1, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa9c58f5c5; BYTE $0x00 // vaddpd ymm3, ymm1, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0faa458f5c5; BYTE $0x00 // vaddpd ymm4, ymm1, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faac58f5c5; BYTE $0x00 // vaddpd ymm5, ymm1, yword [rdx + 8*rdi + 224]
+ QUAD $0x0080f894117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 128], ymm2
+ QUAD $0x00a0f89c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 160], ymm3
+ QUAD $0x00c0f8a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 192], ymm4
+ QUAD $0x00e0f8ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 224], ymm5
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_320
+ JMP LBB1_474
+
+LBB1_321: // 256-bit vector setup, 8-bit subtract-scalar: destination = source - al, 128 bytes per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; esi = r10d rounded down to a multiple of 128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; ymm0 = low byte of eax in all 32 byte lanes
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_481 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = byte index = 0
+
+LBB1_323: // main loop: two chunks (256 bytes) per pass, [r8 + rdi ...] = [rdx + rdi ...] - ymm0
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + rdi + 224]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_323
+ JMP LBB1_482
+
+LBB1_324: // 256-bit vector setup, 8-bit subtract-scalar: destination = source - al, 128 bytes per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; esi = r10d rounded down to a multiple of 128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; ymm0 = low byte of eax in all 32 byte lanes
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_489 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = byte index = 0
+
+LBB1_326: // main loop: two chunks (256 bytes) per pass, [r8 + rdi ...] = [rdx + rdi ...] - ymm0
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + rdi + 224]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_326
+ JMP LBB1_490
+
+LBB1_327: // 256-bit vector setup, 8-bit add-scalar: destination = al + source, 128 bytes per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; esi = r10d rounded down to a multiple of 128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; ymm0 = low byte of eax in all 32 byte lanes
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_497 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = byte index = 0
+
+LBB1_329: // main loop: two chunks (256 bytes) per pass, [r8 + rdi ...] = ymm0 + [rdx + rdi ...]
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x603a // vpaddb ymm4, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a94fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rdx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_329
+ JMP LBB1_498
+
+LBB1_330: // 256-bit vector setup, 8-bit add-scalar: destination = al + source, 128 bytes per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; esi = r10d rounded down to a multiple of 128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; ymm0 = low byte of eax in all 32 byte lanes
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_505 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = byte index = 0
+
+LBB1_332: // main loop: two chunks (256 bytes) per pass, [r8 + rdi ...] = ymm0 + [rdx + rdi ...]
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x603a // vpaddb ymm4, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a94fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rdx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_332
+ JMP LBB1_506
+
+LBB1_333: // 256-bit vector setup, 64-bit subtract-scalar: destination = source - rax, 16 qwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; ymm0 = rax in all 4 qword lanes
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-qword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_513 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_335: // main loop: two chunks (32 qwords) per pass, [r8 + 8*rdi ...] = [rdx + 8*rdi ...] - ymm0
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 8*rdi + 224]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_335
+ JMP LBB1_514
+
+LBB1_336: // 256-bit vector setup, 64-bit subtract-scalar: destination = source - rax, 16 qwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; ymm0 = rax in all 4 qword lanes
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-qword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_521 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_338: // main loop: two chunks (32 qwords) per pass, [r8 + 8*rdi ...] = [rdx + 8*rdi ...] - ymm0
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 8*rdi + 224]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_338
+ JMP LBB1_522
+
+LBB1_339: // 256-bit vector setup, 64-bit add-scalar: destination = rax + source, 16 qwords per chunk
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; ymm0 = rax in all 4 qword lanes
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = number of 16-qword chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_529 // exactly one chunk: skip the two-chunk loop (target outside this view)
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even), counted up to 0 by 2
+ WORD $0xff31 // xor edi, edi ; rdi = element index = 0
+
+LBB1_341: // main loop: two chunks (32 qwords) per pass, [r8 + 8*rdi ...] = ymm0 + [rdx + 8*rdi ...]
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60fa // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa94d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_341
+ JMP LBB1_530
+
+LBB1_342:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_537
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_344:
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60fa // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa94d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_344
+ JMP LBB1_538
+
+LBB1_345:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_545
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_347:
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4c6ffec5; WORD $0x407a // vmovdqu ymm1, yword [rdx + 2*rdi + 64]
+ LONG $0x546ffec5; WORD $0x607a // vmovdqu ymm2, yword [rdx + 2*rdi + 96]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_347
+ JMP LBB1_546
+
+LBB1_348:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_553
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_350:
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4c6ffec5; WORD $0x407a // vmovdqu ymm1, yword [rdx + 2*rdi + 64]
+ LONG $0x546ffec5; WORD $0x607a // vmovdqu ymm2, yword [rdx + 2*rdi + 96]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_350
+ JMP LBB1_554
+
+LBB1_351:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_561
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_353:
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4c6ffec5; WORD $0x407a // vmovdqu ymm1, yword [rdx + 2*rdi + 64]
+ LONG $0x546ffec5; WORD $0x607a // vmovdqu ymm2, yword [rdx + 2*rdi + 96]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_353
+ JMP LBB1_562
+
+LBB1_354:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_569
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_356:
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4c6ffec5; WORD $0x407a // vmovdqu ymm1, yword [rdx + 2*rdi + 64]
+ LONG $0x546ffec5; WORD $0x607a // vmovdqu ymm2, yword [rdx + 2*rdi + 96]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xd0f9edc5 // vpsubw ymm2, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_356
+ JMP LBB1_570
+
+LBB1_357:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_577
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_359:
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x207a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x407a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x607a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_359
+ JMP LBB1_578
+
+LBB1_360:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_585
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_362:
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x207a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x407a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x607a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_362
+ JMP LBB1_586
+
+LBB1_363:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_593
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_365:
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x207a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x407a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x607a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_365
+ JMP LBB1_594
+
+LBB1_366:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_601
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_368:
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x207a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x407a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x607a // vpaddw ymm2, ymm0, yword [rdx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_368
+ JMP LBB1_602
+
+LBB1_369:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_609
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_371:
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 8*rdi + 224]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_371
+ JMP LBB1_610
+
+LBB1_372:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xe0 // and ecx, -32
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0
+ LONG $0xe0718d48 // lea rsi, [rcx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_617
+ WORD $0x894c; BYTE $0xcf // mov rdi, r9
+ LONG $0xfee78348 // and rdi, -2
+ WORD $0xf748; BYTE $0xdf // neg rdi
+ WORD $0xf631 // xor esi, esi
+
+LBB1_374:
+ LONG $0x1410fcc5; BYTE $0xb2 // vmovups ymm2, yword [rdx + 4*rsi]
+ LONG $0x5c10fcc5; WORD $0x20b2 // vmovups ymm3, yword [rdx + 4*rsi + 32]
+ LONG $0x6410fcc5; WORD $0x40b2 // vmovups ymm4, yword [rdx + 4*rsi + 64]
+ LONG $0x6c10fcc5; WORD $0x60b2 // vmovups ymm5, yword [rdx + 4*rsi + 96]
+ LONG $0xd15cecc5 // vsubps ymm2, ymm2, ymm1
+ LONG $0xd95ce4c5 // vsubps ymm3, ymm3, ymm1
+ LONG $0xe15cdcc5 // vsubps ymm4, ymm4, ymm1
+ LONG $0xe95cd4c5 // vsubps ymm5, ymm5, ymm1
+ LONG $0x117cc1c4; WORD $0xb014 // vmovups yword [r8 + 4*rsi], ymm2
+ LONG $0x117cc1c4; WORD $0xb05c; BYTE $0x20 // vmovups yword [r8 + 4*rsi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb064; BYTE $0x40 // vmovups yword [r8 + 4*rsi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb06c; BYTE $0x60 // vmovups yword [r8 + 4*rsi + 96], ymm5
+ QUAD $0x000080b29410fcc5; BYTE $0x00 // vmovups ymm2, yword [rdx + 4*rsi + 128]
+ QUAD $0x0000a0b29c10fcc5; BYTE $0x00 // vmovups ymm3, yword [rdx + 4*rsi + 160]
+ QUAD $0x0000c0b2a410fcc5; BYTE $0x00 // vmovups ymm4, yword [rdx + 4*rsi + 192]
+ QUAD $0x0000e0b2ac10fcc5; BYTE $0x00 // vmovups ymm5, yword [rdx + 4*rsi + 224]
+ LONG $0xd15cecc5 // vsubps ymm2, ymm2, ymm1
+ LONG $0xd95ce4c5 // vsubps ymm3, ymm3, ymm1
+ LONG $0xe15cdcc5 // vsubps ymm4, ymm4, ymm1
+ LONG $0xe95cd4c5 // vsubps ymm5, ymm5, ymm1
+ QUAD $0x0080b094117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 128], ymm2
+ QUAD $0x00a0b09c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 160], ymm3
+ QUAD $0x00c0b0a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 192], ymm4
+ QUAD $0x00e0b0ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 224], ymm5
+ LONG $0x40c68348 // add rsi, 64
+ LONG $0x02c78348 // add rdi, 2
+ JNE LBB1_374
+ JMP LBB1_618
+
+LBB1_375:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_625
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_377:
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 8*rdi + 224]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xe0fbddc5 // vpsubq ymm4, ymm4, ymm0
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_377
+ JMP LBB1_626
+
+LBB1_378:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xe0 // and ecx, -32
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0
+ LONG $0xe0718d48 // lea rsi, [rcx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_633
+ WORD $0x894c; BYTE $0xcf // mov rdi, r9
+ LONG $0xfee78348 // and rdi, -2
+ WORD $0xf748; BYTE $0xdf // neg rdi
+ WORD $0xf631 // xor esi, esi
+
+LBB1_380:
+ LONG $0x1410fcc5; BYTE $0xb2 // vmovups ymm2, yword [rdx + 4*rsi]
+ LONG $0x5c10fcc5; WORD $0x20b2 // vmovups ymm3, yword [rdx + 4*rsi + 32]
+ LONG $0x6410fcc5; WORD $0x40b2 // vmovups ymm4, yword [rdx + 4*rsi + 64]
+ LONG $0x6c10fcc5; WORD $0x60b2 // vmovups ymm5, yword [rdx + 4*rsi + 96]
+ LONG $0xd15cecc5 // vsubps ymm2, ymm2, ymm1
+ LONG $0xd95ce4c5 // vsubps ymm3, ymm3, ymm1
+ LONG $0xe15cdcc5 // vsubps ymm4, ymm4, ymm1
+ LONG $0xe95cd4c5 // vsubps ymm5, ymm5, ymm1
+ LONG $0x117cc1c4; WORD $0xb014 // vmovups yword [r8 + 4*rsi], ymm2
+ LONG $0x117cc1c4; WORD $0xb05c; BYTE $0x20 // vmovups yword [r8 + 4*rsi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb064; BYTE $0x40 // vmovups yword [r8 + 4*rsi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb06c; BYTE $0x60 // vmovups yword [r8 + 4*rsi + 96], ymm5
+ QUAD $0x000080b29410fcc5; BYTE $0x00 // vmovups ymm2, yword [rdx + 4*rsi + 128]
+ QUAD $0x0000a0b29c10fcc5; BYTE $0x00 // vmovups ymm3, yword [rdx + 4*rsi + 160]
+ QUAD $0x0000c0b2a410fcc5; BYTE $0x00 // vmovups ymm4, yword [rdx + 4*rsi + 192]
+ QUAD $0x0000e0b2ac10fcc5; BYTE $0x00 // vmovups ymm5, yword [rdx + 4*rsi + 224]
+ LONG $0xd15cecc5 // vsubps ymm2, ymm2, ymm1
+ LONG $0xd95ce4c5 // vsubps ymm3, ymm3, ymm1
+ LONG $0xe15cdcc5 // vsubps ymm4, ymm4, ymm1
+ LONG $0xe95cd4c5 // vsubps ymm5, ymm5, ymm1
+ QUAD $0x0080b094117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 128], ymm2
+ QUAD $0x00a0b09c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 160], ymm3
+ QUAD $0x00c0b0a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 192], ymm4
+ QUAD $0x00e0b0ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rsi + 224], ymm5
+ LONG $0x40c68348 // add rsi, 64
+ LONG $0x02c78348 // add rdi, 2
+ JNE LBB1_380
+ JMP LBB1_634
+
+LBB1_381:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_641
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_383:
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60fa // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa94d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_383
+ JMP LBB1_642
+
+LBB1_384:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xe0 // and ecx, -32
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0
+ LONG $0xe0718d48 // lea rsi, [rcx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_649
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB1_386:
+ LONG $0x1458f4c5; BYTE $0xba // vaddps ymm2, ymm1, yword [rdx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20ba // vaddps ymm3, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40ba // vaddps ymm4, ymm1, yword [rdx + 4*rdi + 64]
+ LONG $0x6c58f4c5; WORD $0x60ba // vaddps ymm5, ymm1, yword [rdx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb86c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm5
+ QUAD $0x000080ba9458f4c5; BYTE $0x00 // vaddps ymm2, ymm1, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba9c58f4c5; BYTE $0x00 // vaddps ymm3, ymm1, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0baa458f4c5; BYTE $0x00 // vaddps ymm4, ymm1, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baac58f4c5; BYTE $0x00 // vaddps ymm5, ymm1, yword [rdx + 4*rdi + 224]
+ QUAD $0x0080b894117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 128], ymm2
+ QUAD $0x00a0b89c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 160], ymm3
+ QUAD $0x00c0b8a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 192], ymm4
+ QUAD $0x00e0b8ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 224], ymm5
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_386
+ JMP LBB1_650
+
+LBB1_387:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_657
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_389:
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60fa // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080fa8cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rdx + 8*rdi + 128]
+ QUAD $0x0000a0fa94d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 160]
+ QUAD $0x0000c0fa9cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 192]
+ QUAD $0x0000e0faa4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rdx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_389
+ JMP LBB1_658
+
+LBB1_390:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xe0 // and ecx, -32
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0
+ LONG $0xe0718d48 // lea rsi, [rcx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_665
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB1_392:
+ LONG $0x1458f4c5; BYTE $0xba // vaddps ymm2, ymm1, yword [rdx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20ba // vaddps ymm3, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40ba // vaddps ymm4, ymm1, yword [rdx + 4*rdi + 64]
+ LONG $0x6c58f4c5; WORD $0x60ba // vaddps ymm5, ymm1, yword [rdx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb86c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm5
+ QUAD $0x000080ba9458f4c5; BYTE $0x00 // vaddps ymm2, ymm1, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba9c58f4c5; BYTE $0x00 // vaddps ymm3, ymm1, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0baa458f4c5; BYTE $0x00 // vaddps ymm4, ymm1, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baac58f4c5; BYTE $0x00 // vaddps ymm5, ymm1, yword [rdx + 4*rdi + 224]
+ QUAD $0x0080b894117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 128], ymm2
+ QUAD $0x00a0b89c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 160], ymm3
+ QUAD $0x00c0b8a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 192], ymm4
+ QUAD $0x00e0b8ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 224], ymm5
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_392
+ JMP LBB1_666
+
+LBB1_393: // prologue for the 2x-unrolled vpsubb loop (LBB1_395): rsi = r10d & -128, ymm0 = low byte of eax in every lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_673 // rsi == 128 (a single 128-byte chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_395: // each pass handles two 128-byte chunks: [r8 + rdi + k] = [rdx + rdi + k] - ymm0 per byte
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + rdi + 224]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_395 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_674 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_396: // prologue for the 2x-unrolled vpsubb loop (LBB1_398): rsi = r10d & -128, ymm0 = low byte of eax in every lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_681 // rsi == 128 (a single 128-byte chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_398: // each pass handles two 128-byte chunks: [r8 + rdi + k] = [rdx + rdi + k] - ymm0 per byte
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + rdi + 224]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xe0f8ddc5 // vpsubb ymm4, ymm4, ymm0
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_398 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_682 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_399: // prologue for the 2x-unrolled vpaddb loop (LBB1_401): rsi = r10d & -128, ymm0 = low byte of eax in every lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_689 // rsi == 128 (a single 128-byte chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_401: // each pass handles two 128-byte chunks: [r8 + rdi + k] = ymm0 + [rdx + rdi + k] per byte
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x603a // vpaddb ymm4, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a94fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rdx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_401 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_690 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_402: // prologue for the 2x-unrolled vpaddb loop (LBB1_404): rsi = r10d & -128, ymm0 = low byte of eax in every lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0
+ LONG $0x804e8d48 // lea rcx, [rsi - 128]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_697 // rsi == 128 (a single 128-byte chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_404: // each pass handles two 128-byte chunks: [r8 + rdi + k] = ymm0 + [rdx + rdi + k] per byte
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x603a // vpaddb ymm4, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x0000803a8cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rdx + rdi + 128]
+ QUAD $0x0000a03a94fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rdx + rdi + 160]
+ QUAD $0x0000c03a9cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rdx + rdi + 192]
+ QUAD $0x0000e03aa4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rdx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_404 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_698 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_405: // prologue for the 2x-unrolled vpsubd loop (LBB1_407): rsi = r10d & -32, ymm0 = eax in every dword lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_705 // rsi == 32 (a single 32-dword chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_407: // each pass handles two 32-dword chunks: [r8 + 4*rdi + k] = [rdx + 4*rdi + k] - ymm0 per dword
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 4*rdi + 224]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_407 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_706 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_408: // prologue for the 2x-unrolled vpsubd loop (LBB1_410): rsi = r10d & -32, ymm0 = eax in every dword lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_713 // rsi == 32 (a single 32-dword chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_410: // each pass handles two 32-dword chunks: [r8 + 4*rdi + k] = [rdx + 4*rdi + k] - ymm0 per dword
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8c6ffec5; BYTE $0x00 // vmovdqu ymm1, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdx + 4*rdi + 224]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xe0faddc5 // vpsubd ymm4, ymm4, ymm0
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_410 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_714 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_411: // prologue for the 2x-unrolled vpaddd loop (LBB1_413): rsi = r10d & -32, ymm0 = eax in every dword lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_721 // rsi == 32 (a single 32-dword chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_413: // each pass handles two 32-dword chunks: [r8 + 4*rdi + k] = ymm0 + [rdx + 4*rdi + k] per dword
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60ba // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba94fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_413 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_722 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_414: // prologue for the 2x-unrolled vpaddd loop (LBB1_416): rsi = r10d & -32, ymm0 = eax in every dword lane
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_729 // rsi == 32 (a single 32-dword chunk): bypass the unrolled loop; target is outside this excerpt
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_416: // each pass handles two 32-dword chunks: [r8 + 4*rdi + k] = ymm0 + [rdx + 4*rdi + k] per dword
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60ba // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080ba8cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rdx + 4*rdi + 128]
+ QUAD $0x0000a0ba94fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 160]
+ QUAD $0x0000c0ba9cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 192]
+ QUAD $0x0000e0baa4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rdx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_416 // rcx started at -(r9 & -2); loop until it counts up to zero
+ JMP LBB1_730 // target is outside this excerpt; NOTE(review): presumably the odd-chunk step - confirm
+
+LBB1_417: // entry that clears the element index (rdi = 0) and falls through to LBB1_418
+ WORD $0xff31 // xor edi, edi
+
+LBB1_418: // if bit 0 of r9 is set, process one 32-dword chunk: [r8 + 4*rdi + k] = [rdx + 4*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_420 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xc0faddc5 // vpsubd ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_420: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_421 // otherwise continue at LBB1_421 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_425: // entry that clears the element index (rdi = 0) and falls through to LBB1_426
+ WORD $0xff31 // xor edi, edi
+
+LBB1_426: // if bit 0 of r9 is set, process one 32-dword chunk: [r8 + 4*rdi + k] = [rdx + 4*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_428 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xc0faddc5 // vpsubd ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_428: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_429 // otherwise continue at LBB1_429 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_433: // entry that clears the element index (rdi = 0) and falls through to LBB1_434
+ WORD $0xff31 // xor edi, edi
+
+LBB1_434: // if bit 0 of r9 is set, process one 32-dword chunk: [r8 + 4*rdi + k] = ymm0 + [rdx + 4*rdi + k]
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_436 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60ba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_436: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_437 // otherwise continue at LBB1_437 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_441: // entry that clears the element index (rdi = 0) and falls through to LBB1_442
+ WORD $0xff31 // xor edi, edi
+
+LBB1_442: // if bit 0 of r9 is set, process one 32-dword chunk: [r8 + 4*rdi + k] = ymm0 + [rdx + 4*rdi + k]
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_444 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60ba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_444: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_445 // otherwise continue at LBB1_445 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_449: // entry that clears the element index (rsi = 0) and falls through to LBB1_450
+ WORD $0xf631 // xor esi, esi
+
+LBB1_450: // if bit 0 of r9 is set, process one chunk of 16 doubles: [r8 + 8*rsi + k] = [rdx + 8*rsi + k] - ymm1
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_452 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x1410fdc5; BYTE $0xf2 // vmovupd ymm2, yword [rdx + 8*rsi]
+ LONG $0x5c10fdc5; WORD $0x20f2 // vmovupd ymm3, yword [rdx + 8*rsi + 32]
+ LONG $0x6410fdc5; WORD $0x40f2 // vmovupd ymm4, yword [rdx + 8*rsi + 64]
+ LONG $0x6c10fdc5; WORD $0x60f2 // vmovupd ymm5, yword [rdx + 8*rsi + 96]
+ LONG $0xd15cedc5 // vsubpd ymm2, ymm2, ymm1
+ LONG $0xd95ce5c5 // vsubpd ymm3, ymm3, ymm1
+ LONG $0xe15cddc5 // vsubpd ymm4, ymm4, ymm1
+ LONG $0xc95cd5c5 // vsubpd ymm1, ymm5, ymm1
+ LONG $0x117dc1c4; WORD $0xf014 // vmovupd yword [r8 + 8*rsi], ymm2
+ LONG $0x117dc1c4; WORD $0xf05c; BYTE $0x20 // vmovupd yword [r8 + 8*rsi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf064; BYTE $0x40 // vmovupd yword [r8 + 8*rsi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf04c; BYTE $0x60 // vmovupd yword [r8 + 8*rsi + 96], ymm1
+
+LBB1_452: // compare rcx with rax to choose the continuation
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737 // rcx == rax: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_453 // otherwise continue at LBB1_453 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_457: // entry that clears the element index (rsi = 0) and falls through to LBB1_458
+ WORD $0xf631 // xor esi, esi
+
+LBB1_458: // if bit 0 of r9 is set, process one chunk of 16 doubles: [r8 + 8*rsi + k] = [rdx + 8*rsi + k] - ymm1
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_460 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x1410fdc5; BYTE $0xf2 // vmovupd ymm2, yword [rdx + 8*rsi]
+ LONG $0x5c10fdc5; WORD $0x20f2 // vmovupd ymm3, yword [rdx + 8*rsi + 32]
+ LONG $0x6410fdc5; WORD $0x40f2 // vmovupd ymm4, yword [rdx + 8*rsi + 64]
+ LONG $0x6c10fdc5; WORD $0x60f2 // vmovupd ymm5, yword [rdx + 8*rsi + 96]
+ LONG $0xd15cedc5 // vsubpd ymm2, ymm2, ymm1
+ LONG $0xd95ce5c5 // vsubpd ymm3, ymm3, ymm1
+ LONG $0xe15cddc5 // vsubpd ymm4, ymm4, ymm1
+ LONG $0xc95cd5c5 // vsubpd ymm1, ymm5, ymm1
+ LONG $0x117dc1c4; WORD $0xf014 // vmovupd yword [r8 + 8*rsi], ymm2
+ LONG $0x117dc1c4; WORD $0xf05c; BYTE $0x20 // vmovupd yword [r8 + 8*rsi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf064; BYTE $0x40 // vmovupd yword [r8 + 8*rsi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf04c; BYTE $0x60 // vmovupd yword [r8 + 8*rsi + 96], ymm1
+
+LBB1_460: // compare rcx with rax to choose the continuation
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737 // rcx == rax: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_461 // otherwise continue at LBB1_461 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_465: // entry that clears the element index (rdi = 0) and falls through to LBB1_466
+ WORD $0xff31 // xor edi, edi
+
+LBB1_466: // if bit 0 of r9 is set, process one chunk of 16 doubles: [r8 + 8*rdi + k] = ymm1 + [rdx + 8*rdi + k]
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_468 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x1458f5c5; BYTE $0xfa // vaddpd ymm2, ymm1, yword [rdx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20fa // vaddpd ymm3, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40fa // vaddpd ymm4, ymm1, yword [rdx + 8*rdi + 64]
+ LONG $0x4c58f5c5; WORD $0x60fa // vaddpd ymm1, ymm1, yword [rdx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm1
+
+LBB1_468: // compare rcx with rax to choose the continuation
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737 // rcx == rax: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_469 // otherwise continue at LBB1_469 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_473: // entry that clears the element index (rdi = 0) and falls through to LBB1_474
+ WORD $0xff31 // xor edi, edi
+
+LBB1_474: // if bit 0 of r9 is set, process one chunk of 16 doubles: [r8 + 8*rdi + k] = ymm1 + [rdx + 8*rdi + k]
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_476 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x1458f5c5; BYTE $0xfa // vaddpd ymm2, ymm1, yword [rdx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20fa // vaddpd ymm3, ymm1, yword [rdx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40fa // vaddpd ymm4, ymm1, yword [rdx + 8*rdi + 64]
+ LONG $0x4c58f5c5; WORD $0x60fa // vaddpd ymm1, ymm1, yword [rdx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm1
+
+LBB1_476: // compare rcx with rax to choose the continuation
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737 // rcx == rax: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_477 // otherwise continue at LBB1_477 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_481: // entry that clears the byte index (rdi = 0) and falls through to LBB1_482
+ WORD $0xff31 // xor edi, edi
+
+LBB1_482: // if bit 0 of r9 is set, process one 128-byte chunk: [r8 + rdi + k] = [rdx + rdi + k] - ymm0 per byte
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_484 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xc0f8ddc5 // vpsubb ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_484: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_485 // otherwise continue at LBB1_485 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_489: // entry that clears the byte index (rdi = 0) and falls through to LBB1_490
+ WORD $0xff31 // xor edi, edi
+
+LBB1_490: // if bit 0 of r9 is set, process one 128-byte chunk: [r8 + rdi + k] = [rdx + rdi + k] - ymm0 per byte
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_492 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xc0f8ddc5 // vpsubb ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_492: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_493 // otherwise continue at LBB1_493 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_497: // entry that clears the byte index (rdi = 0) and falls through to LBB1_498
+ WORD $0xff31 // xor edi, edi
+
+LBB1_498: // if bit 0 of r9 is set, process one 128-byte chunk: [r8 + rdi + k] = ymm0 + [rdx + rdi + k] per byte
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_500 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x603a // vpaddb ymm0, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_500: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_501 // otherwise continue at LBB1_501 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_505: // entry that clears the byte index (rdi = 0) and falls through to LBB1_506
+ WORD $0xff31 // xor edi, edi
+
+LBB1_506: // if bit 0 of r9 is set, process one 128-byte chunk: [r8 + rdi + k] = ymm0 + [rdx + rdi + k] per byte
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_508 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x603a // vpaddb ymm0, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_508: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_509 // otherwise continue at LBB1_509 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_513: // entry that clears the element index (rdi = 0) and falls through to LBB1_514
+ WORD $0xff31 // xor edi, edi
+
+LBB1_514: // if bit 0 of r9 is set, process one chunk of 16 qwords: [r8 + 8*rdi + k] = [rdx + 8*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_516 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xc0fbddc5 // vpsubq ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_516: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_517 // otherwise continue at LBB1_517 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_521: // entry that clears the element index (rdi = 0) and falls through to LBB1_522
+ WORD $0xff31 // xor edi, edi
+
+LBB1_522: // if bit 0 of r9 is set, process one chunk of 16 qwords: [r8 + 8*rdi + k] = [rdx + 8*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_524 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xc0fbddc5 // vpsubq ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_524: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_525 // otherwise continue at LBB1_525 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_529: // entry that clears the element index (rdi = 0) and falls through to LBB1_530
+ WORD $0xff31 // xor edi, edi
+
+LBB1_530: // if bit 0 of r9 is set, process one chunk of 16 qwords: [r8 + 8*rdi + k] = ymm0 + [rdx + 8*rdi + k]
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_532 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60fa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_532: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_533 // otherwise continue at LBB1_533 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_537: // entry that clears the element index (rdi = 0) and falls through to LBB1_538
+ WORD $0xff31 // xor edi, edi
+
+LBB1_538: // if bit 0 of r9 is set, process one chunk of 16 qwords: [r8 + 8*rdi + k] = ymm0 + [rdx + 8*rdi + k]
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_540 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60fa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_540: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_541 // otherwise continue at LBB1_541 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_545: // entry that clears the element index (rdi = 0) and falls through to LBB1_546
+ WORD $0xff31 // xor edi, edi
+
+LBB1_546: // if bit 0 of r9 is set, process one chunk of 32 words: [r8 + 2*rdi + k] = [rdx + 2*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_548 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xc0f9edc5 // vpsubw ymm0, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_548: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_549 // otherwise continue at LBB1_549 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_553: // entry that clears the element index (rdi = 0) and falls through to LBB1_554
+ WORD $0xff31 // xor edi, edi
+
+LBB1_554: // if bit 0 of r9 is set, process one chunk of 32 words: [r8 + 2*rdi + k] = [rdx + 2*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_556 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xc0f9edc5 // vpsubw ymm0, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_556: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_557 // otherwise continue at LBB1_557 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_561: // entry that clears the element index (rdi = 0) and falls through to LBB1_562
+ WORD $0xff31 // xor edi, edi
+
+LBB1_562: // if bit 0 of r9 is set, process one chunk of 32 words: [r8 + 2*rdi + k] = [rdx + 2*rdi + k] - ymm0
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_564 // bit 0 of r9 clear: skip the single-chunk step (NOTE(review): r9 presumably holds the chunk count - confirm at the jump site)
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xc0f9edc5 // vpsubw ymm0, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_564: // compare rsi with r10 to choose the continuation
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737 // rsi == r10: go to LBB1_737 (outside this excerpt; NOTE(review): presumably the common exit - confirm)
+ JMP LBB1_565 // otherwise continue at LBB1_565 (outside this excerpt; NOTE(review): presumably a scalar remainder loop - confirm)
+
+LBB1_569:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_570:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_572
+ LONG $0x0c6ffec5; BYTE $0x7a // vmovdqu ymm1, yword [rdx + 2*rdi]
+ LONG $0x546ffec5; WORD $0x207a // vmovdqu ymm2, yword [rdx + 2*rdi + 32]
+ LONG $0xc8f9f5c5 // vpsubw ymm1, ymm1, ymm0
+ LONG $0xc0f9edc5 // vpsubw ymm0, ymm2, ymm0
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_572:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_573
+
+LBB1_577:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_578:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_580
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x207a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_580:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_581
+
+LBB1_585:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_586:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_588
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x207a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_588:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_589
+
+LBB1_593:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_594:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_596
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x207a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_596:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_597
+
+LBB1_601:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_602:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_604
+ LONG $0x0cfdfdc5; BYTE $0x7a // vpaddw ymm1, ymm0, yword [rdx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x207a // vpaddw ymm0, ymm0, yword [rdx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB1_604:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_605
+
+LBB1_609:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_610:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_612
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xc0fbddc5 // vpsubq ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_612:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_613
+
+LBB1_617:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_618:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_620
+ LONG $0x1410fcc5; BYTE $0xb2 // vmovups ymm2, yword [rdx + 4*rsi]
+ LONG $0x5c10fcc5; WORD $0x20b2 // vmovups ymm3, yword [rdx + 4*rsi + 32]
+ LONG $0x6410fcc5; WORD $0x40b2 // vmovups ymm4, yword [rdx + 4*rsi + 64]
+ LONG $0x6c10fcc5; WORD $0x60b2 // vmovups ymm5, yword [rdx + 4*rsi + 96]
+ LONG $0xd15cecc5 // vsubps ymm2, ymm2, ymm1
+ LONG $0xd95ce4c5 // vsubps ymm3, ymm3, ymm1
+ LONG $0xe15cdcc5 // vsubps ymm4, ymm4, ymm1
+ LONG $0xc95cd4c5 // vsubps ymm1, ymm5, ymm1
+ LONG $0x117cc1c4; WORD $0xb014 // vmovups yword [r8 + 4*rsi], ymm2
+ LONG $0x117cc1c4; WORD $0xb05c; BYTE $0x20 // vmovups yword [r8 + 4*rsi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb064; BYTE $0x40 // vmovups yword [r8 + 4*rsi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb04c; BYTE $0x60 // vmovups yword [r8 + 4*rsi + 96], ymm1
+
+LBB1_620:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_621
+
+LBB1_625:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_626:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_628
+ LONG $0x0c6ffec5; BYTE $0xfa // vmovdqu ymm1, yword [rdx + 8*rdi]
+ LONG $0x546ffec5; WORD $0x20fa // vmovdqu ymm2, yword [rdx + 8*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40fa // vmovdqu ymm3, yword [rdx + 8*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60fa // vmovdqu ymm4, yword [rdx + 8*rdi + 96]
+ LONG $0xc8fbf5c5 // vpsubq ymm1, ymm1, ymm0
+ LONG $0xd0fbedc5 // vpsubq ymm2, ymm2, ymm0
+ LONG $0xd8fbe5c5 // vpsubq ymm3, ymm3, ymm0
+ LONG $0xc0fbddc5 // vpsubq ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_628:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_629
+
+LBB1_633:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_634:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_636
+ LONG $0x1410fcc5; BYTE $0xb2 // vmovups ymm2, yword [rdx + 4*rsi]
+ LONG $0x5c10fcc5; WORD $0x20b2 // vmovups ymm3, yword [rdx + 4*rsi + 32]
+ LONG $0x6410fcc5; WORD $0x40b2 // vmovups ymm4, yword [rdx + 4*rsi + 64]
+ LONG $0x6c10fcc5; WORD $0x60b2 // vmovups ymm5, yword [rdx + 4*rsi + 96]
+ LONG $0xd15cecc5 // vsubps ymm2, ymm2, ymm1
+ LONG $0xd95ce4c5 // vsubps ymm3, ymm3, ymm1
+ LONG $0xe15cdcc5 // vsubps ymm4, ymm4, ymm1
+ LONG $0xc95cd4c5 // vsubps ymm1, ymm5, ymm1
+ LONG $0x117cc1c4; WORD $0xb014 // vmovups yword [r8 + 4*rsi], ymm2
+ LONG $0x117cc1c4; WORD $0xb05c; BYTE $0x20 // vmovups yword [r8 + 4*rsi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb064; BYTE $0x40 // vmovups yword [r8 + 4*rsi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb04c; BYTE $0x60 // vmovups yword [r8 + 4*rsi + 96], ymm1
+
+LBB1_636:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_637
+
+LBB1_641:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_642:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_644
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60fa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_644:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_645
+
+LBB1_649:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_650:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_652
+ LONG $0x1458f4c5; BYTE $0xba // vaddps ymm2, ymm1, yword [rdx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20ba // vaddps ymm3, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40ba // vaddps ymm4, ymm1, yword [rdx + 4*rdi + 64]
+ LONG $0x4c58f4c5; WORD $0x60ba // vaddps ymm1, ymm1, yword [rdx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm1
+
+LBB1_652:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_653
+
+LBB1_657:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_658:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_660
+ LONG $0x0cd4fdc5; BYTE $0xfa // vpaddq ymm1, ymm0, yword [rdx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20fa // vpaddq ymm2, ymm0, yword [rdx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40fa // vpaddq ymm3, ymm0, yword [rdx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60fa // vpaddq ymm0, ymm0, yword [rdx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB1_660:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_661
+
+LBB1_665:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_666:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_668
+ LONG $0x1458f4c5; BYTE $0xba // vaddps ymm2, ymm1, yword [rdx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20ba // vaddps ymm3, ymm1, yword [rdx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40ba // vaddps ymm4, ymm1, yword [rdx + 4*rdi + 64]
+ LONG $0x4c58f4c5; WORD $0x60ba // vaddps ymm1, ymm1, yword [rdx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm1
+
+LBB1_668:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_669
+
+LBB1_673:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_674:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_676
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xc0f8ddc5 // vpsubb ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_676:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_677
+
+LBB1_681:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_682:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_684
+ LONG $0x0c6ffec5; BYTE $0x3a // vmovdqu ymm1, yword [rdx + rdi]
+ LONG $0x546ffec5; WORD $0x203a // vmovdqu ymm2, yword [rdx + rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x403a // vmovdqu ymm3, yword [rdx + rdi + 64]
+ LONG $0x646ffec5; WORD $0x603a // vmovdqu ymm4, yword [rdx + rdi + 96]
+ LONG $0xc8f8f5c5 // vpsubb ymm1, ymm1, ymm0
+ LONG $0xd0f8edc5 // vpsubb ymm2, ymm2, ymm0
+ LONG $0xd8f8e5c5 // vpsubb ymm3, ymm3, ymm0
+ LONG $0xc0f8ddc5 // vpsubb ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_684:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_685
+
+LBB1_689:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_690:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_692
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x603a // vpaddb ymm0, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_692:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_693
+
+LBB1_697:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_698:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_700
+ LONG $0x0cfcfdc5; BYTE $0x3a // vpaddb ymm1, ymm0, yword [rdx + rdi]
+ LONG $0x54fcfdc5; WORD $0x203a // vpaddb ymm2, ymm0, yword [rdx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x403a // vpaddb ymm3, ymm0, yword [rdx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x603a // vpaddb ymm0, ymm0, yword [rdx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB1_700:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_701
+
+LBB1_705:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_706:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_708
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xc0faddc5 // vpsubd ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_708:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_709
+
+LBB1_713:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_714:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_716
+ LONG $0x0c6ffec5; BYTE $0xba // vmovdqu ymm1, yword [rdx + 4*rdi]
+ LONG $0x546ffec5; WORD $0x20ba // vmovdqu ymm2, yword [rdx + 4*rdi + 32]
+ LONG $0x5c6ffec5; WORD $0x40ba // vmovdqu ymm3, yword [rdx + 4*rdi + 64]
+ LONG $0x646ffec5; WORD $0x60ba // vmovdqu ymm4, yword [rdx + 4*rdi + 96]
+ LONG $0xc8faf5c5 // vpsubd ymm1, ymm1, ymm0
+ LONG $0xd0faedc5 // vpsubd ymm2, ymm2, ymm0
+ LONG $0xd8fae5c5 // vpsubd ymm3, ymm3, ymm0
+ LONG $0xc0faddc5 // vpsubd ymm0, ymm4, ymm0
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_716:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_717
+
+LBB1_721:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_722:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_724
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60ba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_724:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_725
+
+LBB1_729:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_730:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_732
+ LONG $0x0cfefdc5; BYTE $0xba // vpaddd ymm1, ymm0, yword [rdx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20ba // vpaddd ymm2, ymm0, yword [rdx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40ba // vpaddd ymm3, ymm0, yword [rdx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60ba // vpaddd ymm0, ymm0, yword [rdx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB1_732:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB1_733
+
+LBB1_737:
+ VZEROUPPER
+ RET
+
+TEXT ·_arithmetic_scalar_arr_avx2(SB), $0-48
+
+ MOVQ typ+0(FP), DI
+ MOVQ op+8(FP), SI
+ MOVQ inLeft+16(FP), DX
+ MOVQ inRight+24(FP), CX
+ MOVQ out+32(FP), R8
+ MOVQ len+40(FP), R9
+
+ LONG $0x01fe8040 // cmp sil, 1
+ JG LBB2_11
+ WORD $0x8440; BYTE $0xf6 // test sil, sil
+ JE LBB2_21
+ LONG $0x01fe8040 // cmp sil, 1
+ JNE LBB2_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_37
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_65
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_105
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_108
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_10
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_297
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_297
+
+LBB2_10:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_421:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_423
+
+LBB2_422:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_422
+
+LBB2_423:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_424:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_424
+ JMP LBB2_737
+
+LBB2_11:
+ LONG $0x02fe8040 // cmp sil, 2
+ JE LBB2_29
+ LONG $0x03fe8040 // cmp sil, 3
+ JNE LBB2_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_44
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_70
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_111
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_114
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_20
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_300
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_300
+
+LBB2_20:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_429:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_431
+
+LBB2_430:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_430
+
+LBB2_431:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_432:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_432
+ JMP LBB2_737
+
+LBB2_21:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_51
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_75
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_117
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_120
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028b // mov eax, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_28
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_303
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_303
+
+LBB2_28:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_437:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_439
+
+LBB2_438:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_438
+
+LBB2_439:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_440:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_440
+ JMP LBB2_737
+
+LBB2_29:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_58
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_80
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_123
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_126
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028b // mov eax, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_36
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_306
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_306
+
+LBB2_36:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_445:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_447
+
+LBB2_446:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_446
+
+LBB2_447:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_448:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_448
+ JMP LBB2_737
+
+LBB2_37:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_85
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_129
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_132
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x0210fbc5 // vmovsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_43
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_309
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_309
+
+LBB2_43:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_453:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_455
+
+LBB2_454:
+ LONG $0x0c5cfbc5; BYTE $0xd1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_454
+
+LBB2_455:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_456:
+ LONG $0x0c5cfbc5; BYTE $0xd1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x4c5cfbc5; WORD $0x08d1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx + 8]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x08 // vmovsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0x4c5cfbc5; WORD $0x10d1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx + 16]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x10 // vmovsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0x4c5cfbc5; WORD $0x18d1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx + 24]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x18 // vmovsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_456
+ JMP LBB2_737
+
+LBB2_44:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_90
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_135
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_138
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x0210fbc5 // vmovsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_50
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_312
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_312
+
+LBB2_50:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_461:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_463
+
+LBB2_462:
+ LONG $0x0c5cfbc5; BYTE $0xd1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_462
+
+LBB2_463:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_464:
+ LONG $0x0c5cfbc5; BYTE $0xd1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x4c5cfbc5; WORD $0x08d1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx + 8]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x08 // vmovsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0x4c5cfbc5; WORD $0x10d1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx + 16]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x10 // vmovsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0x4c5cfbc5; WORD $0x18d1 // vsubsd xmm1, xmm0, qword [rcx + 8*rdx + 24]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x18 // vmovsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_464
+ JMP LBB2_737
+
+LBB2_51:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_95
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_141
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_144
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x0210fbc5 // vmovsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_57
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_315
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_315
+
+LBB2_57:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_469:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_471
+
+LBB2_470:
+ LONG $0x0c58fbc5; BYTE $0xd1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_470
+
+LBB2_471:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_472:
+ LONG $0x0c58fbc5; BYTE $0xd1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x4c58fbc5; WORD $0x08d1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx + 8]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x08 // vmovsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0x4c58fbc5; WORD $0x10d1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx + 16]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x10 // vmovsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0x4c58fbc5; WORD $0x18d1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx + 24]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x18 // vmovsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_472
+ JMP LBB2_737
+
+LBB2_58:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_100
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_147
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_150
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x0210fbc5 // vmovsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_64
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_318
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_318
+
+LBB2_64:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_477:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_479
+
+LBB2_478:
+ LONG $0x0c58fbc5; BYTE $0xd1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_478
+
+LBB2_479:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_480:
+ LONG $0x0c58fbc5; BYTE $0xd1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx]
+ LONG $0x117bc1c4; WORD $0xd00c // vmovsd qword [r8 + 8*rdx], xmm1
+ LONG $0x4c58fbc5; WORD $0x08d1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx + 8]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x08 // vmovsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0x4c58fbc5; WORD $0x10d1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx + 16]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x10 // vmovsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0x4c58fbc5; WORD $0x18d1 // vaddsd xmm1, xmm0, qword [rcx + 8*rdx + 24]
+ LONG $0x117bc1c4; WORD $0xd04c; BYTE $0x18 // vmovsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_480
+ JMP LBB2_737
+
+LBB2_65:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_153
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_69
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_321
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_321
+
+LBB2_69:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_485:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_487
+
+LBB2_486:
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_486
+
+LBB2_487:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_488:
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0131542a // sub dl, byte [rcx + rsi + 1]
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0231542a // sub dl, byte [rcx + rsi + 2]
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0331542a // sub dl, byte [rcx + rsi + 3]
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_488
+ JMP LBB2_737
+
+LBB2_70:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_156
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_74
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_324
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_324
+
+LBB2_74:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_493:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_495
+
+LBB2_494:
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_494
+
+LBB2_495:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_496:
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0131542a // sub dl, byte [rcx + rsi + 1]
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0231542a // sub dl, byte [rcx + rsi + 2]
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0331542a // sub dl, byte [rcx + rsi + 3]
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_496
+ JMP LBB2_737
+
+LBB2_75:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_159
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_79
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_327
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_327
+
+LBB2_79:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_501:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_503
+
+LBB2_502:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_502
+
+LBB2_503:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_504:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_504
+ JMP LBB2_737
+
+LBB2_80:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_162
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_84
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_330
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_330
+
+LBB2_84:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_509:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_511
+
+LBB2_510:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_510
+
+LBB2_511:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_512:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_512
+ JMP LBB2_737
+
+LBB2_85:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_165
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_89
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_333
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_333
+
+LBB2_89:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_517:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_519
+
+LBB2_518:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_518
+
+LBB2_519:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_520:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_520
+ JMP LBB2_737
+
+LBB2_90:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_168
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_94
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_336
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_336
+
+LBB2_94:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_525:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_527
+
+LBB2_526:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_526
+
+LBB2_527:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_528:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_528
+ JMP LBB2_737
+
+LBB2_95:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_171
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_99
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_339
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_339
+
+LBB2_99:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_533:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_535
+
+LBB2_534:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_534
+
+LBB2_535:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_536:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_536
+ JMP LBB2_737
+
+LBB2_100:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_174
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_104
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_342
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_342
+
+LBB2_104:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_541:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_543
+
+LBB2_542:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_542
+
+LBB2_543:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_544:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_544
+ JMP LBB2_737
+
+LBB2_105:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_107
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_345
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_345
+
+LBB2_107:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_549:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_551
+
+LBB2_550:
+ WORD $0xc789 // mov edi, eax
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_550
+
+LBB2_551:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_552:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_552
+ JMP LBB2_737
+
+LBB2_108:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_110
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_348
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_348
+
+LBB2_110:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_557:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_559
+
+LBB2_558:
+ WORD $0xc789 // mov edi, eax
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_558
+
+LBB2_559:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_560:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_560
+ JMP LBB2_737
+
+LBB2_111:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_113
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_351
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_351
+
+LBB2_113:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_565:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_567
+
+LBB2_566:
+ WORD $0xc789 // mov edi, eax
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_566
+
+LBB2_567:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_568:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_568
+ JMP LBB2_737
+
+LBB2_114:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_116
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_354
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_354
+
+LBB2_116:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_573:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_575
+
+LBB2_574:
+ WORD $0xc789 // mov edi, eax
+ LONG $0x713c2b66 // sub di, word [rcx + 2*rsi]
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_574
+
+LBB2_575:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_576:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_576
+ JMP LBB2_737
+
+LBB2_117:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_119
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_357
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_357
+
+LBB2_119:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_581:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_583
+
+LBB2_582:
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_582
+
+LBB2_583:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_584:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_584
+ JMP LBB2_737
+
+LBB2_120:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_122
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_360
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_360
+
+LBB2_122:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_589:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_591
+
+LBB2_590:
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_590
+
+LBB2_591:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_592:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_592
+ JMP LBB2_737
+
+LBB2_123:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_125
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_363
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_363
+
+LBB2_125:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_597:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_599
+
+LBB2_598:
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_598
+
+LBB2_599:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_600:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_600
+ JMP LBB2_737
+
+LBB2_126:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_128
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_366
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_366
+
+LBB2_128:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_605:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd2 // mov rdx, r10
+ LONG $0x03e28348 // and rdx, 3
+ JE LBB2_607
+
+LBB2_606:
+ LONG $0x713cb70f // movzx edi, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc7 // add di, ax
+ LONG $0x3c894166; BYTE $0x70 // mov word [r8 + 2*rsi], di
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc28348 // add rdx, -1
+ JNE LBB2_606
+
+LBB2_607:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_608:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_608
+ JMP LBB2_737
+
+LBB2_129: // 64-bit integer case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx] -- r11 = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_131 // fewer than 16 elements: use the scalar loop
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_369 // input ends at or before output start (no overlap); LBB2_369 is outside this view, presumably the vectorized path
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_369 // output ends at or before input start (no overlap)
+
+LBB2_131: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_613: // rsi = next element index
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10 -- rdx = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_615
+
+LBB2_614: // peel loop: one element per iteration, rdi times
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_614
+
+LBB2_615: // rdx still holds (elements remaining at LBB2_613) - 1
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_616: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_616
+ JMP LBB2_737
+
+LBB2_132: // float32 case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ LONG $0x0210fac5 // vmovss xmm0, dword [rdx] -- xmm0 = scalar operand
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d -- rax = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_134 // fewer than 32 elements: use the scalar loop
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_372 // input ends at or before output start (no overlap); LBB2_372 is outside this view, presumably the vectorized path
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_372 // output ends at or before input start (no overlap)
+
+LBB2_134: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xd231 // xor edx, edx
+
+LBB2_621: // rdx = next element index
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax -- rsi = rax - rdx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_623
+
+LBB2_622: // peel loop: one element per iteration, rdi times
+ LONG $0x0c5cfac5; BYTE $0x91 // vsubss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_622
+
+LBB2_623: // rsi still holds (elements remaining at LBB2_621) - 1
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_624: // main loop unrolled 4x; runs until rdx == rax
+ LONG $0x0c5cfac5; BYTE $0x91 // vsubss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x4c5cfac5; WORD $0x0491 // vsubss xmm1, xmm0, dword [rcx + 4*rdx + 4]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x04 // vmovss dword [r8 + 4*rdx + 4], xmm1
+ LONG $0x4c5cfac5; WORD $0x0891 // vsubss xmm1, xmm0, dword [rcx + 4*rdx + 8]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x08 // vmovss dword [r8 + 4*rdx + 8], xmm1
+ LONG $0x4c5cfac5; WORD $0x0c91 // vsubss xmm1, xmm0, dword [rcx + 4*rdx + 12]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x0c // vmovss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_624
+ JMP LBB2_737
+
+LBB2_135: // 64-bit integer case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx] -- r11 = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_137 // fewer than 16 elements: use the scalar loop
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_375 // input ends at or before output start (no overlap); LBB2_375 is outside this view, presumably the vectorized path
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_375 // output ends at or before input start (no overlap)
+
+LBB2_137: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_629: // rsi = next element index
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10 -- rdx = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_631
+
+LBB2_630: // peel loop: one element per iteration, rdi times
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_630
+
+LBB2_631: // rdx still holds (elements remaining at LBB2_629) - 1
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_632: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_632
+ JMP LBB2_737
+
+LBB2_138: // float32 case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ LONG $0x0210fac5 // vmovss xmm0, dword [rdx] -- xmm0 = scalar operand
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d -- rax = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_140 // fewer than 32 elements: use the scalar loop
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_378 // input ends at or before output start (no overlap); LBB2_378 is outside this view, presumably the vectorized path
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_378 // output ends at or before input start (no overlap)
+
+LBB2_140: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xd231 // xor edx, edx
+
+LBB2_637: // rdx = next element index
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax -- rsi = rax - rdx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_639
+
+LBB2_638: // peel loop: one element per iteration, rdi times
+ LONG $0x0c5cfac5; BYTE $0x91 // vsubss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_638
+
+LBB2_639: // rsi still holds (elements remaining at LBB2_637) - 1
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_640: // main loop unrolled 4x; runs until rdx == rax
+ LONG $0x0c5cfac5; BYTE $0x91 // vsubss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x4c5cfac5; WORD $0x0491 // vsubss xmm1, xmm0, dword [rcx + 4*rdx + 4]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x04 // vmovss dword [r8 + 4*rdx + 4], xmm1
+ LONG $0x4c5cfac5; WORD $0x0891 // vsubss xmm1, xmm0, dword [rcx + 4*rdx + 8]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x08 // vmovss dword [r8 + 4*rdx + 8], xmm1
+ LONG $0x4c5cfac5; WORD $0x0c91 // vsubss xmm1, xmm0, dword [rcx + 4*rdx + 12]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x0c // vmovss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_640
+ JMP LBB2_737
+
+LBB2_141: // 64-bit integer case: out[i] = in[i] + scalar; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx] -- rax = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_143 // fewer than 16 elements: use the scalar loop
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_381 // input ends at or before output start (no overlap); LBB2_381 is outside this view, presumably the vectorized path
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_381 // output ends at or before input start (no overlap)
+
+LBB2_143: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_645: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_647
+
+LBB2_646: // peel loop: one element per iteration, rdi times
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_646
+
+LBB2_647: // r9 still holds (elements remaining at LBB2_645) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_648: // main loop unrolled 4x; runs until rsi == r10
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_648
+ JMP LBB2_737
+
+LBB2_144: // float32 case: out[i] = scalar + in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ LONG $0x0210fac5 // vmovss xmm0, dword [rdx] -- xmm0 = scalar operand
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d -- rax = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_146 // fewer than 32 elements: use the scalar loop
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_384 // input ends at or before output start (no overlap); LBB2_384 is outside this view, presumably the vectorized path
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_384 // output ends at or before input start (no overlap)
+
+LBB2_146: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xd231 // xor edx, edx
+
+LBB2_653: // rdx = next element index
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax -- rsi = rax - rdx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_655
+
+LBB2_654: // peel loop: one element per iteration, rdi times
+ LONG $0x0c58fac5; BYTE $0x91 // vaddss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_654
+
+LBB2_655: // rsi still holds (elements remaining at LBB2_653) - 1
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_656: // main loop unrolled 4x; runs until rdx == rax
+ LONG $0x0c58fac5; BYTE $0x91 // vaddss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x4c58fac5; WORD $0x0491 // vaddss xmm1, xmm0, dword [rcx + 4*rdx + 4]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x04 // vmovss dword [r8 + 4*rdx + 4], xmm1
+ LONG $0x4c58fac5; WORD $0x0891 // vaddss xmm1, xmm0, dword [rcx + 4*rdx + 8]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x08 // vmovss dword [r8 + 4*rdx + 8], xmm1
+ LONG $0x4c58fac5; WORD $0x0c91 // vaddss xmm1, xmm0, dword [rcx + 4*rdx + 12]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x0c // vmovss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_656
+ JMP LBB2_737
+
+LBB2_147: // 64-bit integer case: out[i] = in[i] + scalar; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx] -- rax = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_149 // fewer than 16 elements: use the scalar loop
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_387 // input ends at or before output start (no overlap); LBB2_387 is outside this view, presumably the vectorized path
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_387 // output ends at or before input start (no overlap)
+
+LBB2_149: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_661: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_663
+
+LBB2_662: // peel loop: one element per iteration, rdi times
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_662
+
+LBB2_663: // r9 still holds (elements remaining at LBB2_661) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_664: // main loop unrolled 4x; runs until rsi == r10
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_664
+ JMP LBB2_737
+
+LBB2_150: // float32 case: out[i] = scalar + in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ LONG $0x0210fac5 // vmovss xmm0, dword [rdx] -- xmm0 = scalar operand
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d -- rax = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_152 // fewer than 32 elements: use the scalar loop
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_390 // input ends at or before output start (no overlap); LBB2_390 is outside this view, presumably the vectorized path
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_390 // output ends at or before input start (no overlap)
+
+LBB2_152: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xd231 // xor edx, edx
+
+LBB2_669: // rdx = next element index
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax -- rsi = rax - rdx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_671
+
+LBB2_670: // peel loop: one element per iteration, rdi times
+ LONG $0x0c58fac5; BYTE $0x91 // vaddss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_670
+
+LBB2_671: // rsi still holds (elements remaining at LBB2_669) - 1
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_672: // main loop unrolled 4x; runs until rdx == rax
+ LONG $0x0c58fac5; BYTE $0x91 // vaddss xmm1, xmm0, dword [rcx + 4*rdx]
+ LONG $0x117ac1c4; WORD $0x900c // vmovss dword [r8 + 4*rdx], xmm1
+ LONG $0x4c58fac5; WORD $0x0491 // vaddss xmm1, xmm0, dword [rcx + 4*rdx + 4]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x04 // vmovss dword [r8 + 4*rdx + 4], xmm1
+ LONG $0x4c58fac5; WORD $0x0891 // vaddss xmm1, xmm0, dword [rcx + 4*rdx + 8]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x08 // vmovss dword [r8 + 4*rdx + 8], xmm1
+ LONG $0x4c58fac5; WORD $0x0c91 // vaddss xmm1, xmm0, dword [rcx + 4*rdx + 12]
+ LONG $0x117ac1c4; WORD $0x904c; BYTE $0x0c // vmovss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_672
+ JMP LBB2_737
+
+LBB2_153: // 8-bit integer case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x028a // mov al, byte [rdx] -- al = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_155 // fewer than 128 elements: use the scalar loop
+ LONG $0x11148d4a // lea rdx, [rcx + r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_393 // input ends at or before output start (no overlap); LBB2_393 is outside this view, presumably the vectorized path
+ LONG $0x10148d4b // lea rdx, [r8 + r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_393 // output ends at or before input start (no overlap)
+
+LBB2_155: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_677: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_679
+
+LBB2_678: // peel loop: one element per iteration, rdi times
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_678
+
+LBB2_679: // r9 still holds (elements remaining at LBB2_677) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_680: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0131542a // sub dl, byte [rcx + rsi + 1]
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0231542a // sub dl, byte [rcx + rsi + 2]
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0331542a // sub dl, byte [rcx + rsi + 3]
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_680
+ JMP LBB2_737
+
+LBB2_156: // 8-bit integer case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x028a // mov al, byte [rdx] -- al = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_158 // fewer than 128 elements: use the scalar loop
+ LONG $0x11148d4a // lea rdx, [rcx + r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_396 // input ends at or before output start (no overlap); LBB2_396 is outside this view, presumably the vectorized path
+ LONG $0x10148d4b // lea rdx, [r8 + r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_396 // output ends at or before input start (no overlap)
+
+LBB2_158: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_685: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_687
+
+LBB2_686: // peel loop: one element per iteration, rdi times
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_686
+
+LBB2_687: // r9 still holds (elements remaining at LBB2_685) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_688: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0xc289 // mov edx, eax
+ WORD $0x142a; BYTE $0x31 // sub dl, byte [rcx + rsi]
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0131542a // sub dl, byte [rcx + rsi + 1]
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0231542a // sub dl, byte [rcx + rsi + 2]
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ WORD $0xc289 // mov edx, eax
+ LONG $0x0331542a // sub dl, byte [rcx + rsi + 3]
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_688
+ JMP LBB2_737
+
+LBB2_159: // 8-bit integer case: out[i] = in[i] + scalar; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x028a // mov al, byte [rdx] -- al = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_161 // fewer than 128 elements: use the scalar loop
+ LONG $0x11148d4a // lea rdx, [rcx + r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_399 // input ends at or before output start (no overlap); LBB2_399 is outside this view, presumably the vectorized path
+ LONG $0x10148d4b // lea rdx, [r8 + r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_399 // output ends at or before input start (no overlap)
+
+LBB2_161: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_693: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_695
+
+LBB2_694: // peel loop: one element per iteration, rdi times
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_694
+
+LBB2_695: // r9 still holds (elements remaining at LBB2_693) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_696: // main loop unrolled 4x; runs until rsi == r10
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_696
+ JMP LBB2_737
+
+LBB2_162: // 8-bit integer case: out[i] = in[i] + scalar; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x028a // mov al, byte [rdx] -- al = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x80f98141; WORD $0x0000; BYTE $0x00 // cmp r9d, 128
+ JB LBB2_164 // fewer than 128 elements: use the scalar loop
+ LONG $0x11148d4a // lea rdx, [rcx + r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_402 // input ends at or before output start (no overlap); LBB2_402 is outside this view, presumably the vectorized path
+ LONG $0x10148d4b // lea rdx, [r8 + r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_402 // output ends at or before input start (no overlap)
+
+LBB2_164: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_701: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_703
+
+LBB2_702: // peel loop: one element per iteration, rdi times
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_702
+
+LBB2_703: // r9 still holds (elements remaining at LBB2_701) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_704: // main loop unrolled 4x; runs until rsi == r10
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_704
+ JMP LBB2_737
+
+LBB2_165: // 32-bit integer case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx] -- r11d = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_167 // fewer than 32 elements: use the scalar loop
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_405 // input ends at or before output start (no overlap); LBB2_405 is outside this view, presumably the vectorized path
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_405 // output ends at or before input start (no overlap)
+
+LBB2_167: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_709: // rsi = next element index
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10 -- rdx = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_711
+
+LBB2_710: // peel loop: one element per iteration, rdi times
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_710
+
+LBB2_711: // rdx still holds (elements remaining at LBB2_709) - 1
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_712: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_712
+ JMP LBB2_737
+
+LBB2_168: // 32-bit integer case: out[i] = scalar - in[i]; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx] -- r11d = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_170 // fewer than 32 elements: use the scalar loop
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_408 // input ends at or before output start (no overlap); LBB2_408 is outside this view, presumably the vectorized path
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_408 // output ends at or before input start (no overlap)
+
+LBB2_170: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_717: // rsi = next element index
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10 -- rdx = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_719
+
+LBB2_718: // peel loop: one element per iteration, rdi times
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_718
+
+LBB2_719: // rdx still holds (elements remaining at LBB2_717) - 1
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_720: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_720
+ JMP LBB2_737
+
+LBB2_171: // 32-bit integer case: out[i] = in[i] + scalar; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x028b // mov eax, dword [rdx] -- eax = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_173 // fewer than 32 elements: use the scalar loop
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_411 // input ends at or before output start (no overlap); LBB2_411 is outside this view, presumably the vectorized path
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_411 // output ends at or before input start (no overlap)
+
+LBB2_173: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_725: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_727
+
+LBB2_726: // peel loop: one element per iteration, rdi times
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_726
+
+LBB2_727: // r9 still holds (elements remaining at LBB2_725) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_728: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_728
+ JMP LBB2_737
+
+LBB2_174: // 32-bit integer case: out[i] = in[i] + scalar; in read via rcx, out written via r8, scalar loaded from [rdx], count in r9d
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737 // count <= 0: nothing to do
+ WORD $0x028b // mov eax, dword [rdx] -- eax = scalar operand
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d -- r10 = zero-extended element count
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_176 // fewer than 32 elements: use the scalar loop
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10] -- rdx = end of the input range
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_414 // input ends at or before output start (no overlap); LBB2_414 is outside this view, presumably the vectorized path
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10] -- rdx = end of the output range
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_414 // output ends at or before input start (no overlap)
+
+LBB2_176: // small count or overlapping ranges: scalar loop starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB2_733: // rsi = next element index
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 -- r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 -- rdi = count mod 4, elements to peel one at a time
+ JE LBB2_735
+
+LBB2_734: // peel loop: one element per iteration, rdi times
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_734
+
+LBB2_735: // r9 still holds (elements remaining at LBB2_733) - 1
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737 // fewer than 4 were remaining, so the peel loop covered them all
+
+LBB2_736: // main loop unrolled 4x; runs until rsi == r10
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_736
+ JMP LBB2_737
+
+LBB2_297: // 256-bit vector path: out[i] = scalar - in[i] on 32-bit lanes; uses r10d and r11d set before this view (presumably element count and scalar)
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- rsi = r10d rounded down to a multiple of 32
+ LONG $0x6e79c1c4; BYTE $0xc3 // vmovd xmm0, r11d
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 -- ymm0 = r11d replicated into every dword lane
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_417 // exactly one chunk: skip the paired-chunk loop (LBB2_417 is outside this view)
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); stepped by 2 up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index
+
+LBB2_299: // each iteration processes two 32-element chunks (64 dwords)
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fafdc5; WORD $0x60b9 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfafdc5; BYTE $0x00 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fafdc5; BYTE $0x00 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfafdc5; BYTE $0x00 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fafdc5; BYTE $0x00 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_299
+ JMP LBB2_418 // continues outside this view
+
+LBB2_300: // 256-bit vector path: out[i] = scalar - in[i] on 32-bit lanes; uses r10d and r11d set before this view (presumably element count and scalar)
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- rsi = r10d rounded down to a multiple of 32
+ LONG $0x6e79c1c4; BYTE $0xc3 // vmovd xmm0, r11d
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 -- ymm0 = r11d replicated into every dword lane
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_425 // exactly one chunk: skip the paired-chunk loop (LBB2_425 is outside this view)
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); stepped by 2 up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index
+
+LBB2_302: // each iteration processes two 32-element chunks (64 dwords)
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fafdc5; WORD $0x60b9 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfafdc5; BYTE $0x00 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fafdc5; BYTE $0x00 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfafdc5; BYTE $0x00 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fafdc5; BYTE $0x00 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_302
+ JMP LBB2_426 // continues outside this view
+
+LBB2_303: // 256-bit vector path: out[i] = scalar + in[i] on 32-bit lanes; uses r10d and eax set before this view (presumably element count and scalar)
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- rsi = r10d rounded down to a multiple of 32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 -- ymm0 = eax replicated into every dword lane
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_433 // exactly one chunk: skip the paired-chunk loop (LBB2_433 is outside this view)
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); stepped by 2 up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index
+
+LBB2_305: // each iteration processes two 32-element chunks (64 dwords)
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60b9 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_305
+ JMP LBB2_434 // continues outside this view
+
+LBB2_306: // vector-path setup for LBB2_308: out[r8] = broadcast(eax) + in[rcx], 32-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; round r10d down to a multiple of 32 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 ; replicate the low dword into all 8 lanes of ymm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_441 // rdx == 0 (exactly one chunk): branch to LBB2_441 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_308: // 2 chunks per pass: 8 ymm add/store pairs covering 64 dwords
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60b9 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64 ; advance the element index by the 64 dwords just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_308 // repeat while the pair counter is non-zero
+ JMP LBB2_442 // paired loop finished; continue at LBB2_442 (defined elsewhere in this routine)
+
+LBB2_309: // vector-path setup for LBB2_311: out[r8] = broadcast(xmm0) - in[rcx], float64 lanes
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf0 // and edx, -16 ; round eax down to a multiple of 16 (eax is presumably the element count -- TODO confirm)
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; replicate the low double of xmm0 into all 4 lanes of ymm1
+ LONG $0xf0728d48 // lea rsi, [rdx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rdx - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_449 // rsi == 0 (exactly one chunk): branch to LBB2_449 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_311: // 2 chunks per pass: 8 ymm subtract/store pairs covering 32 doubles; the broadcast value in ymm1 is the minuend
+ LONG $0x145cf5c5; BYTE $0xf9 // vsubpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c5cf5c5; WORD $0x20f9 // vsubpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x645cf5c5; WORD $0x40f9 // vsubpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x6c5cf5c5; WORD $0x60f9 // vsubpd ymm5, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf86c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm5
+ QUAD $0x000080f9945cf5c5; BYTE $0x00 // vsubpd ymm2, ymm1, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f99c5cf5c5; BYTE $0x00 // vsubpd ymm3, ymm1, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f9a45cf5c5; BYTE $0x00 // vsubpd ymm4, ymm1, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9ac5cf5c5; BYTE $0x00 // vsubpd ymm5, ymm1, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f894117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 128], ymm2
+ QUAD $0x00a0f89c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 160], ymm3
+ QUAD $0x00c0f8a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 192], ymm4
+ QUAD $0x00e0f8ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 224], ymm5
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 doubles just processed
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_311 // repeat while the pair counter is non-zero
+ JMP LBB2_450 // paired loop finished; continue at LBB2_450 (defined elsewhere in this routine)
+
+LBB2_312: // vector-path setup for LBB2_314: out[r8] = broadcast(xmm0) - in[rcx], float64 lanes
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf0 // and edx, -16 ; round eax down to a multiple of 16 (eax is presumably the element count -- TODO confirm)
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; replicate the low double of xmm0 into all 4 lanes of ymm1
+ LONG $0xf0728d48 // lea rsi, [rdx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rdx - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_457 // rsi == 0 (exactly one chunk): branch to LBB2_457 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_314: // 2 chunks per pass: 8 ymm subtract/store pairs covering 32 doubles; the broadcast value in ymm1 is the minuend
+ LONG $0x145cf5c5; BYTE $0xf9 // vsubpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c5cf5c5; WORD $0x20f9 // vsubpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x645cf5c5; WORD $0x40f9 // vsubpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x6c5cf5c5; WORD $0x60f9 // vsubpd ymm5, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf86c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm5
+ QUAD $0x000080f9945cf5c5; BYTE $0x00 // vsubpd ymm2, ymm1, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f99c5cf5c5; BYTE $0x00 // vsubpd ymm3, ymm1, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f9a45cf5c5; BYTE $0x00 // vsubpd ymm4, ymm1, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9ac5cf5c5; BYTE $0x00 // vsubpd ymm5, ymm1, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f894117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 128], ymm2
+ QUAD $0x00a0f89c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 160], ymm3
+ QUAD $0x00c0f8a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 192], ymm4
+ QUAD $0x00e0f8ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 224], ymm5
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 doubles just processed
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_314 // repeat while the pair counter is non-zero
+ JMP LBB2_458 // paired loop finished; continue at LBB2_458 (defined elsewhere in this routine)
+
+LBB2_315: // vector-path setup for LBB2_317: out[r8] = broadcast(xmm0) + in[rcx], float64 lanes
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf0 // and edx, -16 ; round eax down to a multiple of 16 (eax is presumably the element count -- TODO confirm)
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; replicate the low double of xmm0 into all 4 lanes of ymm1
+ LONG $0xf0728d48 // lea rsi, [rdx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rdx - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_465 // rsi == 0 (exactly one chunk): branch to LBB2_465 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_317: // 2 chunks per pass: 8 ymm add/store pairs covering 32 doubles
+ LONG $0x1458f5c5; BYTE $0xf9 // vaddpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20f9 // vaddpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40f9 // vaddpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x6c58f5c5; WORD $0x60f9 // vaddpd ymm5, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf86c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm5
+ QUAD $0x000080f99458f5c5; BYTE $0x00 // vaddpd ymm2, ymm1, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f99c58f5c5; BYTE $0x00 // vaddpd ymm3, ymm1, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f9a458f5c5; BYTE $0x00 // vaddpd ymm4, ymm1, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9ac58f5c5; BYTE $0x00 // vaddpd ymm5, ymm1, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f894117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 128], ymm2
+ QUAD $0x00a0f89c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 160], ymm3
+ QUAD $0x00c0f8a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 192], ymm4
+ QUAD $0x00e0f8ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 224], ymm5
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 doubles just processed
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_317 // repeat while the pair counter is non-zero
+ JMP LBB2_466 // paired loop finished; continue at LBB2_466 (defined elsewhere in this routine)
+
+LBB2_318: // vector-path setup for LBB2_320: out[r8] = broadcast(xmm0) + in[rcx], float64 lanes
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf0 // and edx, -16 ; round eax down to a multiple of 16 (eax is presumably the element count -- TODO confirm)
+ LONG $0x197de2c4; BYTE $0xc8 // vbroadcastsd ymm1, xmm0 ; replicate the low double of xmm0 into all 4 lanes of ymm1
+ LONG $0xf0728d48 // lea rsi, [rdx - 16]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rdx - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_473 // rsi == 0 (exactly one chunk): branch to LBB2_473 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_320: // 2 chunks per pass: 8 ymm add/store pairs covering 32 doubles
+ LONG $0x1458f5c5; BYTE $0xf9 // vaddpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20f9 // vaddpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40f9 // vaddpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x6c58f5c5; WORD $0x60f9 // vaddpd ymm5, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf86c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm5
+ QUAD $0x000080f99458f5c5; BYTE $0x00 // vaddpd ymm2, ymm1, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f99c58f5c5; BYTE $0x00 // vaddpd ymm3, ymm1, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f9a458f5c5; BYTE $0x00 // vaddpd ymm4, ymm1, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9ac58f5c5; BYTE $0x00 // vaddpd ymm5, ymm1, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f894117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 128], ymm2
+ QUAD $0x00a0f89c117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 160], ymm3
+ QUAD $0x00c0f8a4117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 192], ymm4
+ QUAD $0x00e0f8ac117dc1c4; WORD $0x0000 // vmovupd yword [r8 + 8*rdi + 224], ymm5
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 doubles just processed
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_320 // repeat while the pair counter is non-zero
+ JMP LBB2_474 // paired loop finished; continue at LBB2_474 (defined elsewhere in this routine)
+
+LBB2_321: // vector-path setup for LBB2_323: out[r8] = broadcast(al) - in[rcx], 8-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; round r10d down to a multiple of 128 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; replicate the low byte into all 32 lanes of ymm0
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 128)/128 + 1 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_481 // rdx == 0 (exactly one chunk): branch to LBB2_481 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB2_323: // 2 chunks per pass: 8 ymm subtract/store pairs covering 256 bytes; the broadcast value in ymm0 is the minuend
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64f8fdc5; WORD $0x6039 // vpsubb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cf8fdc5; BYTE $0x00 // vpsubb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994f8fdc5; BYTE $0x00 // vpsubb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cf8fdc5; BYTE $0x00 // vpsubb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4f8fdc5; BYTE $0x00 // vpsubb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 ; advance the index by the 256 bytes just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_323 // repeat while the pair counter is non-zero
+ JMP LBB2_482 // paired loop finished; continue at LBB2_482 (defined elsewhere in this routine)
+
+LBB2_324: // vector-path setup for LBB2_326: out[r8] = broadcast(al) - in[rcx], 8-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; round r10d down to a multiple of 128 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; replicate the low byte into all 32 lanes of ymm0
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 128)/128 + 1 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_489 // rdx == 0 (exactly one chunk): branch to LBB2_489 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB2_326: // 2 chunks per pass: 8 ymm subtract/store pairs covering 256 bytes; the broadcast value in ymm0 is the minuend
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64f8fdc5; WORD $0x6039 // vpsubb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cf8fdc5; BYTE $0x00 // vpsubb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994f8fdc5; BYTE $0x00 // vpsubb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cf8fdc5; BYTE $0x00 // vpsubb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4f8fdc5; BYTE $0x00 // vpsubb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 ; advance the index by the 256 bytes just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_326 // repeat while the pair counter is non-zero
+ JMP LBB2_490 // paired loop finished; continue at LBB2_490 (defined elsewhere in this routine)
+
+LBB2_327: // vector-path setup for LBB2_329: out[r8] = broadcast(al) + in[rcx], 8-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; round r10d down to a multiple of 128 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; replicate the low byte into all 32 lanes of ymm0
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 128)/128 + 1 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_497 // rdx == 0 (exactly one chunk): branch to LBB2_497 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB2_329: // 2 chunks per pass: 8 ymm add/store pairs covering 256 bytes
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x6039 // vpaddb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 ; advance the index by the 256 bytes just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_329 // repeat while the pair counter is non-zero
+ JMP LBB2_498 // paired loop finished; continue at LBB2_498 (defined elsewhere in this routine)
+
+LBB2_330: // vector-path setup for LBB2_332: out[r8] = broadcast(al) + in[rcx], 8-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 ; round r10d down to a multiple of 128 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 ; replicate the low byte into all 32 lanes of ymm0
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 128)/128 + 1 = number of 128-byte chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_505 // rdx == 0 (exactly one chunk): branch to LBB2_505 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB2_332: // 2 chunks per pass: 8 ymm add/store pairs covering 256 bytes
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x6039 // vpaddb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 ; advance the index by the 256 bytes just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_332 // repeat while the pair counter is non-zero
+ JMP LBB2_506 // paired loop finished; continue at LBB2_506 (defined elsewhere in this routine)
+
+LBB2_333: // vector-path setup for LBB2_335: out[r8] = broadcast(r11) - in[rcx], 64-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; round r10d down to a multiple of 16 (r10d is presumably the element count -- TODO confirm)
+ LONG $0x6ef9c1c4; BYTE $0xc3 // vmovq xmm0, r11
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; replicate the low qword into all 4 lanes of ymm0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_513 // rdx == 0 (exactly one chunk): branch to LBB2_513 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_335: // 2 chunks per pass: 8 ymm subtract/store pairs covering 32 qwords; the broadcast value in ymm0 is the minuend
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64fbfdc5; WORD $0x60f9 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cfbfdc5; BYTE $0x00 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994fbfdc5; BYTE $0x00 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cfbfdc5; BYTE $0x00 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4fbfdc5; BYTE $0x00 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 qwords just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_335 // repeat while the pair counter is non-zero
+ JMP LBB2_514 // paired loop finished; continue at LBB2_514 (defined elsewhere in this routine)
+
+LBB2_336: // vector-path setup for LBB2_338: out[r8] = broadcast(r11) - in[rcx], 64-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; round r10d down to a multiple of 16 (r10d is presumably the element count -- TODO confirm)
+ LONG $0x6ef9c1c4; BYTE $0xc3 // vmovq xmm0, r11
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; replicate the low qword into all 4 lanes of ymm0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_521 // rdx == 0 (exactly one chunk): branch to LBB2_521 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_338: // 2 chunks per pass: 8 ymm subtract/store pairs covering 32 qwords; the broadcast value in ymm0 is the minuend
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64fbfdc5; WORD $0x60f9 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cfbfdc5; BYTE $0x00 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994fbfdc5; BYTE $0x00 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cfbfdc5; BYTE $0x00 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4fbfdc5; BYTE $0x00 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 qwords just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_338 // repeat while the pair counter is non-zero
+ JMP LBB2_522 // paired loop finished; continue at LBB2_522 (defined elsewhere in this routine)
+
+LBB2_339: // vector-path setup for LBB2_341: out[r8] = broadcast(rax) + in[rcx], 64-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; round r10d down to a multiple of 16 (r10d is presumably the element count -- TODO confirm)
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; replicate the low qword into all 4 lanes of ymm0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_529 // rdx == 0 (exactly one chunk): branch to LBB2_529 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_341: // 2 chunks per pass: 8 ymm add/store pairs covering 32 qwords
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60f9 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 qwords just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_341 // repeat while the pair counter is non-zero
+ JMP LBB2_530 // paired loop finished; continue at LBB2_530 (defined elsewhere in this routine)
+
+LBB2_342: // vector-path setup for LBB2_344: out[r8] = broadcast(rax) + in[rcx], 64-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; round r10d down to a multiple of 16 (r10d is presumably the element count -- TODO confirm)
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 ; replicate the low qword into all 4 lanes of ymm0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16)/16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_537 // rdx == 0 (exactly one chunk): branch to LBB2_537 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_344: // 2 chunks per pass: 8 ymm add/store pairs covering 32 qwords
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60f9 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 ; advance the element index by the 32 qwords just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_344 // repeat while the pair counter is non-zero
+ JMP LBB2_538 // paired loop finished; continue at LBB2_538 (defined elsewhere in this routine)
+
+LBB2_345: // vector-path setup for LBB2_347: out[r8] = broadcast(ax) - in[rcx], 16-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; round r10d down to a multiple of 32 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 ; replicate the low word into all 16 lanes of ymm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_545 // rdx == 0 (exactly one chunk): branch to LBB2_545 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_347: // 2 chunks per pass: 4 ymm subtract/store pairs covering 64 words; the broadcast value in ymm0 is the minuend
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54f9fdc5; WORD $0x2079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cf9fdc5; WORD $0x4079 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54f9fdc5; WORD $0x6079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 ; advance the element index by the 64 words just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_347 // repeat while the pair counter is non-zero
+ JMP LBB2_546 // paired loop finished; continue at LBB2_546 (defined elsewhere in this routine)
+
+LBB2_348: // vector-path setup for LBB2_350: out[r8] = broadcast(ax) - in[rcx], 16-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; round r10d down to a multiple of 32 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 ; replicate the low word into all 16 lanes of ymm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_553 // rdx == 0 (exactly one chunk): branch to LBB2_553 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_350: // 2 chunks per pass: 4 ymm subtract/store pairs covering 64 words; the broadcast value in ymm0 is the minuend
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54f9fdc5; WORD $0x2079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cf9fdc5; WORD $0x4079 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54f9fdc5; WORD $0x6079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 ; advance the element index by the 64 words just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_350 // repeat while the pair counter is non-zero
+ JMP LBB2_554 // paired loop finished; continue at LBB2_554 (defined elsewhere in this routine)
+
+LBB2_351: // vector-path setup for LBB2_353: out[r8] = broadcast(ax) - in[rcx], 16-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; round r10d down to a multiple of 32 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 ; replicate the low word into all 16 lanes of ymm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_561 // rdx == 0 (exactly one chunk): branch to LBB2_561 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_353: // 2 chunks per pass: 4 ymm subtract/store pairs covering 64 words; the broadcast value in ymm0 is the minuend
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54f9fdc5; WORD $0x2079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cf9fdc5; WORD $0x4079 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54f9fdc5; WORD $0x6079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 ; advance the element index by the 64 words just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_353 // repeat while the pair counter is non-zero
+ JMP LBB2_562 // paired loop finished; continue at LBB2_562 (defined elsewhere in this routine)
+
+LBB2_354: // vector-path setup for LBB2_356: out[r8] = broadcast(ax) - in[rcx], 16-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; round r10d down to a multiple of 32 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 ; replicate the low word into all 16 lanes of ymm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_569 // rdx == 0 (exactly one chunk): branch to LBB2_569 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_356: // 2 chunks per pass: 4 ymm subtract/store pairs covering 64 words; the broadcast value in ymm0 is the minuend
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54f9fdc5; WORD $0x2079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cf9fdc5; WORD $0x4079 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54f9fdc5; WORD $0x6079 // vpsubw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 ; advance the element index by the 64 words just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_356 // repeat while the pair counter is non-zero
+ JMP LBB2_570 // paired loop finished; continue at LBB2_570 (defined elsewhere in this routine)
+
+LBB2_357: // vector-path setup for LBB2_359: out[r8] = broadcast(ax) + in[rcx], 16-bit integer lanes
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; round r10d down to a multiple of 32 (r10d is presumably the element count -- TODO confirm)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 ; replicate the low word into all 16 lanes of ymm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32)/32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_577 // rdx == 0 (exactly one chunk): branch to LBB2_577 instead of entering the paired loop; label is defined elsewhere in this routine
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx ; rdx = -(r9 & ~1): negated count of chunks that are processed in pairs
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB2_359: // 2 chunks per pass: 4 ymm add/store pairs covering 64 words
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x2079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x4079 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x6079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 ; advance the element index by the 64 words just processed
+ LONG $0x02c28348 // add rdx, 2 ; two chunks consumed; ZF is set once the negated counter reaches 0
+ JNE LBB2_359 // repeat while the pair counter is non-zero
+ JMP LBB2_578 // paired loop finished; continue at LBB2_578 (defined elsewhere in this routine)
+
+LBB2_360: // AVX2 16-bit add: broadcast the low word of eax, then add it to the words at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- esi = r10d rounded down to a multiple of 32 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 -- every 16-bit lane of ymm0 = low word of eax
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_585 // rsi - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_362: // each pass handles 64 words (two 32-element chunks): [r8 + 2*rdi ..] = ymm0 + [rcx + 2*rdi ..]
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x2079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x4079 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x6079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_362
+ JMP LBB2_586 // NOTE(review): LBB2_586 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_363: // AVX2 16-bit add: broadcast the low word of eax, then add it to the words at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- esi = r10d rounded down to a multiple of 32 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 -- every 16-bit lane of ymm0 = low word of eax
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_593 // rsi - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_365: // each pass handles 64 words (two 32-element chunks): [r8 + 2*rdi ..] = ymm0 + [rcx + 2*rdi ..]
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x2079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x4079 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x6079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_365
+ JMP LBB2_594 // NOTE(review): LBB2_594 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_366: // AVX2 16-bit add: broadcast the low word of eax, then add it to the words at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- esi = r10d rounded down to a multiple of 32 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x797de2c4; BYTE $0xc0 // vpbroadcastw ymm0, xmm0 -- every 16-bit lane of ymm0 = low word of eax
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_601 // rsi - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_368: // each pass handles 64 words (two 32-element chunks): [r8 + 2*rdi ..] = ymm0 + [rcx + 2*rdi ..]
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x54fdfdc5; WORD $0x2079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm2
+ LONG $0x4cfdfdc5; WORD $0x4079 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi + 64]
+ LONG $0x54fdfdc5; WORD $0x6079 // vpaddw ymm2, ymm0, yword [rcx + 2*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x784c; BYTE $0x40 // vmovdqu yword [r8 + 2*rdi + 64], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7854; BYTE $0x60 // vmovdqu yword [r8 + 2*rdi + 96], ymm2
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_368
+ JMP LBB2_602 // NOTE(review): LBB2_602 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_369: // AVX2 64-bit subtract: broadcast r11, then compute r11 - [rcx] per qword and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- esi = r10d rounded down to a multiple of 16 (r10d presumably the element count)
+ LONG $0x6ef9c1c4; BYTE $0xc3 // vmovq xmm0, r11
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 -- every 64-bit lane of ymm0 = r11
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_609 // rsi - 16 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_371: // each pass handles 32 qwords (two 16-element chunks): [r8 + 8*rdi ..] = ymm0 - [rcx + 8*rdi ..]
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64fbfdc5; WORD $0x60f9 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cfbfdc5; BYTE $0x00 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994fbfdc5; BYTE $0x00 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cfbfdc5; BYTE $0x00 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4fbfdc5; BYTE $0x00 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 -- advance the element index by the 32 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_371
+ JMP LBB2_610 // NOTE(review): LBB2_610 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_372: // AVX float32 subtract: broadcast the scalar in xmm0, then compute scalar - [rcx] per float and store to [r8]
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xe0 // and edx, -32 -- edx = eax rounded down to a multiple of 32 (eax presumably the element count)
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0 -- every 32-bit lane of ymm1 = low float of xmm0
+ LONG $0xe0728d48 // lea rsi, [rdx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rdx - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_617 // rdx - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi -- rsi = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_374: // each pass handles 64 floats (two 32-element chunks): [r8 + 4*rdi ..] = ymm1 - [rcx + 4*rdi ..]
+ LONG $0x145cf4c5; BYTE $0xb9 // vsubps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c5cf4c5; WORD $0x20b9 // vsubps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x645cf4c5; WORD $0x40b9 // vsubps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x6c5cf4c5; WORD $0x60b9 // vsubps ymm5, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb86c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm5
+ QUAD $0x000080b9945cf4c5; BYTE $0x00 // vsubps ymm2, ymm1, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b99c5cf4c5; BYTE $0x00 // vsubps ymm3, ymm1, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b9a45cf4c5; BYTE $0x00 // vsubps ymm4, ymm1, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9ac5cf4c5; BYTE $0x00 // vsubps ymm5, ymm1, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b894117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 128], ymm2
+ QUAD $0x00a0b89c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 160], ymm3
+ QUAD $0x00c0b8a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 192], ymm4
+ QUAD $0x00e0b8ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 224], ymm5
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c68348 // add rsi, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_374
+ JMP LBB2_618 // NOTE(review): LBB2_618 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_375: // AVX2 64-bit subtract: broadcast r11, then compute r11 - [rcx] per qword and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- esi = r10d rounded down to a multiple of 16 (r10d presumably the element count)
+ LONG $0x6ef9c1c4; BYTE $0xc3 // vmovq xmm0, r11
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 -- every 64-bit lane of ymm0 = r11
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_625 // rsi - 16 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_377: // each pass handles 32 qwords (two 16-element chunks): [r8 + 8*rdi ..] = ymm0 - [rcx + 8*rdi ..]
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64fbfdc5; WORD $0x60f9 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cfbfdc5; BYTE $0x00 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994fbfdc5; BYTE $0x00 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cfbfdc5; BYTE $0x00 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4fbfdc5; BYTE $0x00 // vpsubq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 -- advance the element index by the 32 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_377
+ JMP LBB2_626 // NOTE(review): LBB2_626 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_378: // AVX float32 subtract: broadcast the scalar in xmm0, then compute scalar - [rcx] per float and store to [r8]
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xe0 // and edx, -32 -- edx = eax rounded down to a multiple of 32 (eax presumably the element count)
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0 -- every 32-bit lane of ymm1 = low float of xmm0
+ LONG $0xe0728d48 // lea rsi, [rdx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rdx - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_633 // rdx - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi -- rsi = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_380: // each pass handles 64 floats (two 32-element chunks): [r8 + 4*rdi ..] = ymm1 - [rcx + 4*rdi ..]
+ LONG $0x145cf4c5; BYTE $0xb9 // vsubps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c5cf4c5; WORD $0x20b9 // vsubps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x645cf4c5; WORD $0x40b9 // vsubps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x6c5cf4c5; WORD $0x60b9 // vsubps ymm5, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb86c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm5
+ QUAD $0x000080b9945cf4c5; BYTE $0x00 // vsubps ymm2, ymm1, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b99c5cf4c5; BYTE $0x00 // vsubps ymm3, ymm1, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b9a45cf4c5; BYTE $0x00 // vsubps ymm4, ymm1, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9ac5cf4c5; BYTE $0x00 // vsubps ymm5, ymm1, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b894117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 128], ymm2
+ QUAD $0x00a0b89c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 160], ymm3
+ QUAD $0x00c0b8a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 192], ymm4
+ QUAD $0x00e0b8ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 224], ymm5
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c68348 // add rsi, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_380
+ JMP LBB2_634 // NOTE(review): LBB2_634 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_381: // AVX2 64-bit add: broadcast rax, then add it to the qwords at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- esi = r10d rounded down to a multiple of 16 (r10d presumably the element count)
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 -- every 64-bit lane of ymm0 = rax
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_641 // rsi - 16 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_383: // each pass handles 32 qwords (two 16-element chunks): [r8 + 8*rdi ..] = ymm0 + [rcx + 8*rdi ..]
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60f9 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 -- advance the element index by the 32 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_383
+ JMP LBB2_642 // NOTE(review): LBB2_642 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_384: // AVX float32 add: broadcast the scalar in xmm0, then add it to the floats at [rcx] and store to [r8]
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xe0 // and edx, -32 -- edx = eax rounded down to a multiple of 32 (eax presumably the element count)
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0 -- every 32-bit lane of ymm1 = low float of xmm0
+ LONG $0xe0728d48 // lea rsi, [rdx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rdx - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_649 // rdx - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi -- rsi = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_386: // each pass handles 64 floats (two 32-element chunks): [r8 + 4*rdi ..] = ymm1 + [rcx + 4*rdi ..]
+ LONG $0x1458f4c5; BYTE $0xb9 // vaddps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20b9 // vaddps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40b9 // vaddps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x6c58f4c5; WORD $0x60b9 // vaddps ymm5, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb86c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm5
+ QUAD $0x000080b99458f4c5; BYTE $0x00 // vaddps ymm2, ymm1, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b99c58f4c5; BYTE $0x00 // vaddps ymm3, ymm1, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b9a458f4c5; BYTE $0x00 // vaddps ymm4, ymm1, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9ac58f4c5; BYTE $0x00 // vaddps ymm5, ymm1, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b894117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 128], ymm2
+ QUAD $0x00a0b89c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 160], ymm3
+ QUAD $0x00c0b8a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 192], ymm4
+ QUAD $0x00e0b8ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 224], ymm5
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c68348 // add rsi, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_386
+ JMP LBB2_650 // NOTE(review): LBB2_650 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_387: // AVX2 64-bit add: broadcast rax, then add it to the qwords at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 -- esi = r10d rounded down to a multiple of 16 (r10d presumably the element count)
+ LONG $0x6ef9e1c4; BYTE $0xc0 // vmovq xmm0, rax
+ LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0 -- every 64-bit lane of ymm0 = rax
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_657 // rsi - 16 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_389: // each pass handles 32 qwords (two 16-element chunks): [r8 + 8*rdi ..] = ymm0 + [rcx + 8*rdi ..]
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x64d4fdc5; WORD $0x60f9 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf864; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm4
+ QUAD $0x000080f98cd4fdc5; BYTE $0x00 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi + 128]
+ QUAD $0x0000a0f994d4fdc5; BYTE $0x00 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 160]
+ QUAD $0x0000c0f99cd4fdc5; BYTE $0x00 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 192]
+ QUAD $0x0000e0f9a4d4fdc5; BYTE $0x00 // vpaddq ymm4, ymm0, yword [rcx + 8*rdi + 224]
+ QUAD $0x0080f88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 128], ymm1
+ QUAD $0x00a0f8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 160], ymm2
+ QUAD $0x00c0f89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 192], ymm3
+ QUAD $0x00e0f8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 8*rdi + 224], ymm4
+ LONG $0x20c78348 // add rdi, 32 -- advance the element index by the 32 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_389
+ JMP LBB2_658 // NOTE(review): LBB2_658 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_390: // AVX float32 add: broadcast the scalar in xmm0, then add it to the floats at [rcx] and store to [r8]
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xe0 // and edx, -32 -- edx = eax rounded down to a multiple of 32 (eax presumably the element count)
+ LONG $0x187de2c4; BYTE $0xc8 // vbroadcastss ymm1, xmm0 -- every 32-bit lane of ymm1 = low float of xmm0
+ LONG $0xe0728d48 // lea rsi, [rdx - 32]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rdx - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_665 // rdx - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi -- rsi = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_392: // each pass handles 64 floats (two 32-element chunks): [r8 + 4*rdi ..] = ymm1 + [rcx + 4*rdi ..]
+ LONG $0x1458f4c5; BYTE $0xb9 // vaddps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20b9 // vaddps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40b9 // vaddps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x6c58f4c5; WORD $0x60b9 // vaddps ymm5, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb86c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm5
+ QUAD $0x000080b99458f4c5; BYTE $0x00 // vaddps ymm2, ymm1, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b99c58f4c5; BYTE $0x00 // vaddps ymm3, ymm1, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b9a458f4c5; BYTE $0x00 // vaddps ymm4, ymm1, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9ac58f4c5; BYTE $0x00 // vaddps ymm5, ymm1, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b894117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 128], ymm2
+ QUAD $0x00a0b89c117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 160], ymm3
+ QUAD $0x00c0b8a4117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 192], ymm4
+ QUAD $0x00e0b8ac117cc1c4; WORD $0x0000 // vmovups yword [r8 + 4*rdi + 224], ymm5
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c68348 // add rsi, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_392
+ JMP LBB2_666 // NOTE(review): LBB2_666 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_393: // AVX2 8-bit subtract: broadcast the low byte of eax, then compute it minus each byte at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 -- esi = r10d rounded down to a multiple of 128 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 -- every 8-bit lane of ymm0 = low byte of eax
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 128) / 128 + 1 = number of 128-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_673 // rsi - 128 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_395: // each pass handles 256 bytes (two 128-element chunks): [r8 + rdi ..] = ymm0 - [rcx + rdi ..]
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64f8fdc5; WORD $0x6039 // vpsubb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cf8fdc5; BYTE $0x00 // vpsubb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994f8fdc5; BYTE $0x00 // vpsubb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cf8fdc5; BYTE $0x00 // vpsubb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4f8fdc5; BYTE $0x00 // vpsubb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 -- advance the element index by the 256 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_395
+ JMP LBB2_674 // NOTE(review): LBB2_674 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_396: // AVX2 8-bit subtract: broadcast the low byte of eax, then compute it minus each byte at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 -- esi = r10d rounded down to a multiple of 128 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 -- every 8-bit lane of ymm0 = low byte of eax
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 128) / 128 + 1 = number of 128-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_681 // rsi - 128 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_398: // each pass handles 256 bytes (two 128-element chunks): [r8 + rdi ..] = ymm0 - [rcx + rdi ..]
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64f8fdc5; WORD $0x6039 // vpsubb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cf8fdc5; BYTE $0x00 // vpsubb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994f8fdc5; BYTE $0x00 // vpsubb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cf8fdc5; BYTE $0x00 // vpsubb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4f8fdc5; BYTE $0x00 // vpsubb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 -- advance the element index by the 256 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_398
+ JMP LBB2_682 // NOTE(review): LBB2_682 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_399: // AVX2 8-bit add: broadcast the low byte of eax, then add it to the bytes at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 -- esi = r10d rounded down to a multiple of 128 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 -- every 8-bit lane of ymm0 = low byte of eax
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 128) / 128 + 1 = number of 128-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_689 // rsi - 128 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_401: // each pass handles 256 bytes (two 128-element chunks): [r8 + rdi ..] = ymm0 + [rcx + rdi ..]
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x6039 // vpaddb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 -- advance the element index by the 256 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_401
+ JMP LBB2_690 // NOTE(review): LBB2_690 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_402: // AVX2 8-bit add: broadcast the low byte of eax, then add it to the bytes at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0x80 // and esi, -128 -- esi = r10d rounded down to a multiple of 128 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x787de2c4; BYTE $0xc0 // vpbroadcastb ymm0, xmm0 -- every 8-bit lane of ymm0 = low byte of eax
+ LONG $0x80568d48 // lea rdx, [rsi - 128]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x07e9c149 // shr r9, 7
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 128) / 128 + 1 = number of 128-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_697 // rsi - 128 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_404: // each pass handles 256 bytes (two 128-element chunks): [r8 + rdi ..] = ymm0 + [rcx + rdi ..]
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x64fcfdc5; WORD $0x6039 // vpaddb ymm4, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3864; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm4
+ QUAD $0x000080398cfcfdc5; BYTE $0x00 // vpaddb ymm1, ymm0, yword [rcx + rdi + 128]
+ QUAD $0x0000a03994fcfdc5; BYTE $0x00 // vpaddb ymm2, ymm0, yword [rcx + rdi + 160]
+ QUAD $0x0000c0399cfcfdc5; BYTE $0x00 // vpaddb ymm3, ymm0, yword [rcx + rdi + 192]
+ QUAD $0x0000e039a4fcfdc5; BYTE $0x00 // vpaddb ymm4, ymm0, yword [rcx + rdi + 224]
+ QUAD $0x0080388c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 128], ymm1
+ QUAD $0x00a038947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 160], ymm2
+ QUAD $0x00c0389c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 192], ymm3
+ QUAD $0x00e038a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + rdi + 224], ymm4
+ LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // add rdi, 256 -- advance the element index by the 256 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_404
+ JMP LBB2_698 // NOTE(review): LBB2_698 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_405: // AVX2 32-bit subtract: broadcast r11d, then compute r11d - [rcx] per dword and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- esi = r10d rounded down to a multiple of 32 (r10d presumably the element count)
+ LONG $0x6e79c1c4; BYTE $0xc3 // vmovd xmm0, r11d
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 -- every 32-bit lane of ymm0 = r11d
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_705 // rsi - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_407: // each pass handles 64 dwords (two 32-element chunks): [r8 + 4*rdi ..] = ymm0 - [rcx + 4*rdi ..]
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fafdc5; WORD $0x60b9 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfafdc5; BYTE $0x00 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fafdc5; BYTE $0x00 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfafdc5; BYTE $0x00 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fafdc5; BYTE $0x00 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_407
+ JMP LBB2_706 // NOTE(review): LBB2_706 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_408: // AVX2 32-bit subtract: broadcast r11d, then compute r11d - [rcx] per dword and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- esi = r10d rounded down to a multiple of 32 (r10d presumably the element count)
+ LONG $0x6e79c1c4; BYTE $0xc3 // vmovd xmm0, r11d
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 -- every 32-bit lane of ymm0 = r11d
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_713 // rsi - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_410: // each pass handles 64 dwords (two 32-element chunks): [r8 + 4*rdi ..] = ymm0 - [rcx + 4*rdi ..]
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fafdc5; WORD $0x60b9 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfafdc5; BYTE $0x00 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fafdc5; BYTE $0x00 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfafdc5; BYTE $0x00 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fafdc5; BYTE $0x00 // vpsubd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_410
+ JMP LBB2_714 // NOTE(review): LBB2_714 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_411: // AVX2 32-bit add: broadcast eax, then add it to the dwords at [rcx] and store to [r8]
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 -- esi = r10d rounded down to a multiple of 32 (r10d presumably the element count)
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0 -- every 32-bit lane of ymm0 = eax
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 -- r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_721 // rsi - 32 == 0, i.e. a single chunk: skip the paired-chunk loop
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx -- rdx = -(chunk count rounded down to even); the loop steps it up to zero
+ WORD $0xff31 // xor edi, edi -- rdi = element index, starts at 0
+
+LBB2_413: // each pass handles 64 dwords (two 32-element chunks): [r8 + 4*rdi ..] = ymm0 + [rcx + 4*rdi ..]
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60b9 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64 -- advance the element index by the 64 elements just written
+ LONG $0x02c28348 // add rdx, 2 -- two chunks consumed; ZF set once the even chunk count is exhausted
+ JNE LBB2_413
+ JMP LBB2_722 // NOTE(review): LBB2_722 presumably finishes an odd leftover chunk and the tail -- confirm
+
+LBB2_414:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xc06ef9c5 // vmovd xmm0, eax
+ LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_729
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_416:
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x64fefdc5; WORD $0x60b9 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb864; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm4
+ QUAD $0x000080b98cfefdc5; BYTE $0x00 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi + 128]
+ QUAD $0x0000a0b994fefdc5; BYTE $0x00 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 160]
+ QUAD $0x0000c0b99cfefdc5; BYTE $0x00 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 192]
+ QUAD $0x0000e0b9a4fefdc5; BYTE $0x00 // vpaddd ymm4, ymm0, yword [rcx + 4*rdi + 224]
+ QUAD $0x0080b88c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 128], ymm1
+ QUAD $0x00a0b8947f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 160], ymm2
+ QUAD $0x00c0b89c7f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 192], ymm3
+ QUAD $0x00e0b8a47f7ec1c4; WORD $0x0000 // vmovdqu yword [r8 + 4*rdi + 224], ymm4
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_416
+ JMP LBB2_730
+
+LBB2_417:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_418:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_420
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fafdc5; WORD $0x60b9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_420:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_421
+
+LBB2_425:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_426:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_428
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fafdc5; WORD $0x60b9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_428:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_429
+
+LBB2_433:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_434:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_436
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60b9 // vpaddd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_436:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_437
+
+LBB2_441:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_442:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_444
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60b9 // vpaddd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_444:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_445
+
+LBB2_449:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_450:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_452
+ LONG $0x145cf5c5; BYTE $0xf9 // vsubpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c5cf5c5; WORD $0x20f9 // vsubpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x645cf5c5; WORD $0x40f9 // vsubpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x4c5cf5c5; WORD $0x60f9 // vsubpd ymm1, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm1
+
+LBB2_452:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_453
+
+LBB2_457:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_458:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_460
+ LONG $0x145cf5c5; BYTE $0xf9 // vsubpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c5cf5c5; WORD $0x20f9 // vsubpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x645cf5c5; WORD $0x40f9 // vsubpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x4c5cf5c5; WORD $0x60f9 // vsubpd ymm1, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm1
+
+LBB2_460:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_461
+
+LBB2_465:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_466:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_468
+ LONG $0x1458f5c5; BYTE $0xf9 // vaddpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20f9 // vaddpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40f9 // vaddpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x4c58f5c5; WORD $0x60f9 // vaddpd ymm1, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm1
+
+LBB2_468:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_469
+
+LBB2_473:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_474:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_476
+ LONG $0x1458f5c5; BYTE $0xf9 // vaddpd ymm2, ymm1, yword [rcx + 8*rdi]
+ LONG $0x5c58f5c5; WORD $0x20f9 // vaddpd ymm3, ymm1, yword [rcx + 8*rdi + 32]
+ LONG $0x6458f5c5; WORD $0x40f9 // vaddpd ymm4, ymm1, yword [rcx + 8*rdi + 64]
+ LONG $0x4c58f5c5; WORD $0x60f9 // vaddpd ymm1, ymm1, yword [rcx + 8*rdi + 96]
+ LONG $0x117dc1c4; WORD $0xf814 // vmovupd yword [r8 + 8*rdi], ymm2
+ LONG $0x117dc1c4; WORD $0xf85c; BYTE $0x20 // vmovupd yword [r8 + 8*rdi + 32], ymm3
+ LONG $0x117dc1c4; WORD $0xf864; BYTE $0x40 // vmovupd yword [r8 + 8*rdi + 64], ymm4
+ LONG $0x117dc1c4; WORD $0xf84c; BYTE $0x60 // vmovupd yword [r8 + 8*rdi + 96], ymm1
+
+LBB2_476:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_477
+
+LBB2_481:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_482:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_484
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44f8fdc5; WORD $0x6039 // vpsubb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_484:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_485
+
+LBB2_489:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_490:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_492
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44f8fdc5; WORD $0x6039 // vpsubb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_492:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_493
+
+LBB2_497:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_498:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_500
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x6039 // vpaddb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_500:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_501
+
+LBB2_505:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_506:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_508
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x6039 // vpaddb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_508:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_509
+
+LBB2_513:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_514:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_516
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44fbfdc5; WORD $0x60f9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_516:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_517
+
+LBB2_521:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_522:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_524
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44fbfdc5; WORD $0x60f9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_524:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_525
+
+LBB2_529:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_530:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_532
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60f9 // vpaddq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_532:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_533
+
+LBB2_537:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_538:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_540
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60f9 // vpaddq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_540:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_541
+
+LBB2_545:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_546:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_548
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44f9fdc5; WORD $0x2079 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_548:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_549
+
+LBB2_553:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_554:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_556
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44f9fdc5; WORD $0x2079 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_556:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_557
+
+LBB2_561:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_562:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_564
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44f9fdc5; WORD $0x2079 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_564:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_565
+
+LBB2_569:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_570:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_572
+ LONG $0x0cf9fdc5; BYTE $0x79 // vpsubw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44f9fdc5; WORD $0x2079 // vpsubw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_572:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_573
+
+LBB2_577:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_578:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_580
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x2079 // vpaddw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_580:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_581
+
+LBB2_585:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_586:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_588
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x2079 // vpaddw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_588:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_589
+
+LBB2_593:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_594:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_596
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x2079 // vpaddw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_596:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_597
+
+LBB2_601:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_602:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_604
+ LONG $0x0cfdfdc5; BYTE $0x79 // vpaddw ymm1, ymm0, yword [rcx + 2*rdi]
+ LONG $0x44fdfdc5; WORD $0x2079 // vpaddw ymm0, ymm0, yword [rcx + 2*rdi + 32]
+ LONG $0x7f7ec1c4; WORD $0x780c // vmovdqu yword [r8 + 2*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x7844; BYTE $0x20 // vmovdqu yword [r8 + 2*rdi + 32], ymm0
+
+LBB2_604:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_605
+
+LBB2_609:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_610:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_612
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44fbfdc5; WORD $0x60f9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_612:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_613
+
+LBB2_617:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_618:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_620
+ LONG $0x145cf4c5; BYTE $0xb9 // vsubps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c5cf4c5; WORD $0x20b9 // vsubps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x645cf4c5; WORD $0x40b9 // vsubps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x4c5cf4c5; WORD $0x60b9 // vsubps ymm1, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm1
+
+LBB2_620:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_621
+
+LBB2_625:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_626:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_628
+ LONG $0x0cfbfdc5; BYTE $0xf9 // vpsubq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54fbfdc5; WORD $0x20f9 // vpsubq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cfbfdc5; WORD $0x40f9 // vpsubq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44fbfdc5; WORD $0x60f9 // vpsubq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_628:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_629
+
+LBB2_633:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_634:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_636
+ LONG $0x145cf4c5; BYTE $0xb9 // vsubps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c5cf4c5; WORD $0x20b9 // vsubps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x645cf4c5; WORD $0x40b9 // vsubps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x4c5cf4c5; WORD $0x60b9 // vsubps ymm1, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm1
+
+LBB2_636:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_637
+
+LBB2_641:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_642:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_644
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60f9 // vpaddq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_644:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_645
+
+LBB2_649:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_650:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_652
+ LONG $0x1458f4c5; BYTE $0xb9 // vaddps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20b9 // vaddps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40b9 // vaddps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x4c58f4c5; WORD $0x60b9 // vaddps ymm1, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm1
+
+LBB2_652:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_653
+
+LBB2_657:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_658:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_660
+ LONG $0x0cd4fdc5; BYTE $0xf9 // vpaddq ymm1, ymm0, yword [rcx + 8*rdi]
+ LONG $0x54d4fdc5; WORD $0x20f9 // vpaddq ymm2, ymm0, yword [rcx + 8*rdi + 32]
+ LONG $0x5cd4fdc5; WORD $0x40f9 // vpaddq ymm3, ymm0, yword [rcx + 8*rdi + 64]
+ LONG $0x44d4fdc5; WORD $0x60f9 // vpaddq ymm0, ymm0, yword [rcx + 8*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xf80c // vmovdqu yword [r8 + 8*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xf854; BYTE $0x20 // vmovdqu yword [r8 + 8*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xf85c; BYTE $0x40 // vmovdqu yword [r8 + 8*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xf844; BYTE $0x60 // vmovdqu yword [r8 + 8*rdi + 96], ymm0
+
+LBB2_660:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_661
+
+LBB2_665:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_666:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_668
+ LONG $0x1458f4c5; BYTE $0xb9 // vaddps ymm2, ymm1, yword [rcx + 4*rdi]
+ LONG $0x5c58f4c5; WORD $0x20b9 // vaddps ymm3, ymm1, yword [rcx + 4*rdi + 32]
+ LONG $0x6458f4c5; WORD $0x40b9 // vaddps ymm4, ymm1, yword [rcx + 4*rdi + 64]
+ LONG $0x4c58f4c5; WORD $0x60b9 // vaddps ymm1, ymm1, yword [rcx + 4*rdi + 96]
+ LONG $0x117cc1c4; WORD $0xb814 // vmovups yword [r8 + 4*rdi], ymm2
+ LONG $0x117cc1c4; WORD $0xb85c; BYTE $0x20 // vmovups yword [r8 + 4*rdi + 32], ymm3
+ LONG $0x117cc1c4; WORD $0xb864; BYTE $0x40 // vmovups yword [r8 + 4*rdi + 64], ymm4
+ LONG $0x117cc1c4; WORD $0xb84c; BYTE $0x60 // vmovups yword [r8 + 4*rdi + 96], ymm1
+
+LBB2_668:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_669
+
+LBB2_673:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_674:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_676
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44f8fdc5; WORD $0x6039 // vpsubb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_676:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_677
+
+LBB2_681:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_682:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_684
+ LONG $0x0cf8fdc5; BYTE $0x39 // vpsubb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54f8fdc5; WORD $0x2039 // vpsubb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cf8fdc5; WORD $0x4039 // vpsubb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44f8fdc5; WORD $0x6039 // vpsubb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_684:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_685
+
+LBB2_689:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_690:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_692
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x6039 // vpaddb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_692:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_693
+
+LBB2_697:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_698:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_700
+ LONG $0x0cfcfdc5; BYTE $0x39 // vpaddb ymm1, ymm0, yword [rcx + rdi]
+ LONG $0x54fcfdc5; WORD $0x2039 // vpaddb ymm2, ymm0, yword [rcx + rdi + 32]
+ LONG $0x5cfcfdc5; WORD $0x4039 // vpaddb ymm3, ymm0, yword [rcx + rdi + 64]
+ LONG $0x44fcfdc5; WORD $0x6039 // vpaddb ymm0, ymm0, yword [rcx + rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0x380c // vmovdqu yword [r8 + rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0x3854; BYTE $0x20 // vmovdqu yword [r8 + rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0x385c; BYTE $0x40 // vmovdqu yword [r8 + rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0x3844; BYTE $0x60 // vmovdqu yword [r8 + rdi + 96], ymm0
+
+LBB2_700:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_701
+
+LBB2_705:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_706:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_708
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fafdc5; WORD $0x60b9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_708:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_709
+
+LBB2_713:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_714:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_716
+ LONG $0x0cfafdc5; BYTE $0xb9 // vpsubd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fafdc5; WORD $0x20b9 // vpsubd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfafdc5; WORD $0x40b9 // vpsubd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fafdc5; WORD $0x60b9 // vpsubd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_716:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_717
+
+LBB2_721:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_722:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_724
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60b9 // vpaddd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_724:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_725
+
+LBB2_729:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_730:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_732
+ LONG $0x0cfefdc5; BYTE $0xb9 // vpaddd ymm1, ymm0, yword [rcx + 4*rdi]
+ LONG $0x54fefdc5; WORD $0x20b9 // vpaddd ymm2, ymm0, yword [rcx + 4*rdi + 32]
+ LONG $0x5cfefdc5; WORD $0x40b9 // vpaddd ymm3, ymm0, yword [rcx + 4*rdi + 64]
+ LONG $0x44fefdc5; WORD $0x60b9 // vpaddd ymm0, ymm0, yword [rcx + 4*rdi + 96]
+ LONG $0x7f7ec1c4; WORD $0xb80c // vmovdqu yword [r8 + 4*rdi], ymm1
+ LONG $0x7f7ec1c4; WORD $0xb854; BYTE $0x20 // vmovdqu yword [r8 + 4*rdi + 32], ymm2
+ LONG $0x7f7ec1c4; WORD $0xb85c; BYTE $0x40 // vmovdqu yword [r8 + 4*rdi + 64], ymm3
+ LONG $0x7f7ec1c4; WORD $0xb844; BYTE $0x60 // vmovdqu yword [r8 + 4*rdi + 96], ymm0
+
+LBB2_732:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB2_733
+
+LBB2_737:
+ VZEROUPPER
+ RET
diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_sse4_amd64.go b/go/arrow/compute/internal/kernels/base_arithmetic_sse4_amd64.go
new file mode 100644
index 00000000000..6e5d6504bc6
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/base_arithmetic_sse4_amd64.go
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package kernels
+
+import (
+ "unsafe"
+
+ "github.com/apache/arrow/go/v10/arrow"
+)
+
+//go:noescape
+func _arithmetic_sse4(typ int, op int8, inLeft, inRight, out unsafe.Pointer, len int)
+
+func arithmeticSSE4(typ arrow.Type, op ArithmeticOp, left, right, out []byte, len int) {
+ _arithmetic_sse4(int(typ), int8(op), unsafe.Pointer(&left[0]), unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), len)
+}
+
+//go:noescape
+func _arithmetic_arr_scalar_sse4(typ int, op int8, inLeft, inRight, out unsafe.Pointer, len int)
+
+func arithmeticArrScalarSSE4(typ arrow.Type, op ArithmeticOp, left []byte, right unsafe.Pointer, out []byte, len int) {
+ _arithmetic_arr_scalar_sse4(int(typ), int8(op), unsafe.Pointer(&left[0]), right, unsafe.Pointer(&out[0]), len)
+}
+
+//go:noescape
+func _arithmetic_scalar_arr_sse4(typ int, op int8, inLeft, inRight, out unsafe.Pointer, len int)
+
+func arithmeticScalarArrSSE4(typ arrow.Type, op ArithmeticOp, left unsafe.Pointer, right, out []byte, len int) {
+ _arithmetic_scalar_arr_sse4(int(typ), int8(op), left, unsafe.Pointer(&right[0]), unsafe.Pointer(&out[0]), len)
+}
diff --git a/go/arrow/compute/internal/kernels/base_arithmetic_sse4_amd64.s b/go/arrow/compute/internal/kernels/base_arithmetic_sse4_amd64.s
new file mode 100644
index 00000000000..c7cb89a61ab
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/base_arithmetic_sse4_amd64.s
@@ -0,0 +1,13794 @@
+//+build !noasm !appengine
+// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
+
+TEXT ·_arithmetic_sse4(SB), $0-48
+
+ MOVQ typ+0(FP), DI
+ MOVQ op+8(FP), SI
+ MOVQ inLeft+16(FP), DX
+ MOVQ inRight+24(FP), CX
+ MOVQ out+32(FP), R8
+ MOVQ len+40(FP), R9
+
+ LONG $0x01fe8040 // cmp sil, 1
+ JG LBB0_10
+ WORD $0x8440; BYTE $0xf6 // test sil, sil
+ JE LBB0_19
+ LONG $0x01fe8040 // cmp sil, 1
+ JNE LBB0_697
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_371
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_5
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_412
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_428
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_444
+ WORD $0xf631 // xor esi, esi
+
+LBB0_453:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_455
+
+LBB0_454:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_454
+
+LBB0_455:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_456:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_456
+ JMP LBB0_697
+
+LBB0_10:
+ LONG $0x02fe8040 // cmp sil, 2
+ JE LBB0_192
+ LONG $0x03fe8040 // cmp sil, 3
+ JNE LBB0_697
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_537
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_14
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_578
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_594
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_610
+ WORD $0xf631 // xor esi, esi
+
+LBB0_619:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_621
+
+LBB0_620:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_620
+
+LBB0_621:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_622:
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_622
+ JMP LBB0_697
+
+LBB0_19:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_32
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_21
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_73
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_89
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_105
+ WORD $0xf631 // xor esi, esi
+
+LBB0_114:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_116
+
+LBB0_115:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_115
+
+LBB0_116:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_117:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_117
+ JMP LBB0_697
+
+LBB0_192:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB0_205
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB0_194
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB0_246
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB0_262
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_278
+ WORD $0xf631 // xor esi, esi
+
+LBB0_287:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_289
+
+LBB0_288:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_288
+
+LBB0_289:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_290:
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_290
+ JMP LBB0_697
+
+LBB0_371:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_372
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_486
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_502
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_518
+ WORD $0xf631 // xor esi, esi
+
+LBB0_527:
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_529
+
+LBB0_528:
+ LONG $0x04100ff2; BYTE $0xf2 // movsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045c0ff2; BYTE $0xf1 // subsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_528
+
+LBB0_529:
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_530:
+ LONG $0x04100ff2; BYTE $0xf2 // movsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045c0ff2; BYTE $0xf1 // subsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x44100ff2; WORD $0x08f2 // movsd xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x445c0ff2; WORD $0x08f1 // subsd xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x08 // movsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x44100ff2; WORD $0x10f2 // movsd xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x445c0ff2; WORD $0x10f1 // subsd xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x10 // movsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x44100ff2; WORD $0x18f2 // movsd xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x445c0ff2; WORD $0x18f1 // subsd xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x18 // movsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_530
+ JMP LBB0_697
+
+LBB0_537:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_538
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_652
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_668
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_684
+ WORD $0xf631 // xor esi, esi
+
+LBB0_693:
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_695
+
+LBB0_694:
+ LONG $0x04100ff2; BYTE $0xf2 // movsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045c0ff2; BYTE $0xf1 // subsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_694
+
+LBB0_695:
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_696:
+ LONG $0x04100ff2; BYTE $0xf2 // movsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x045c0ff2; BYTE $0xf1 // subsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x44100ff2; WORD $0x08f2 // movsd xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x445c0ff2; WORD $0x08f1 // subsd xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x08 // movsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x44100ff2; WORD $0x10f2 // movsd xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x445c0ff2; WORD $0x10f1 // subsd xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x10 // movsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x44100ff2; WORD $0x18f2 // movsd xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x445c0ff2; WORD $0x18f1 // subsd xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x18 // movsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_696
+ JMP LBB0_697
+
+LBB0_32:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_33
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_147
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_163
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_179
+ WORD $0xf631 // xor esi, esi
+
+LBB0_188:
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_190
+
+LBB0_189:
+ LONG $0x04100ff2; BYTE $0xf1 // movsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x04580ff2; BYTE $0xf2 // addsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_189
+
+LBB0_190:
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_191:
+ LONG $0x04100ff2; BYTE $0xf1 // movsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x04580ff2; BYTE $0xf2 // addsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x44100ff2; WORD $0x08f1 // movsd xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x44580ff2; WORD $0x08f2 // addsd xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x08 // movsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x44100ff2; WORD $0x10f1 // movsd xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x44580ff2; WORD $0x10f2 // addsd xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x10 // movsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x44100ff2; WORD $0x18f1 // movsd xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x44580ff2; WORD $0x18f2 // addsd xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x18 // movsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_191
+ JMP LBB0_697
+
+LBB0_205:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB0_206
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB0_320
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB0_336
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_352
+ WORD $0xf631 // xor esi, esi
+
+LBB0_361:
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_363
+
+LBB0_362:
+ LONG $0x04100ff2; BYTE $0xf1 // movsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x04580ff2; BYTE $0xf2 // addsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_362
+
+LBB0_363:
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_364:
+ LONG $0x04100ff2; BYTE $0xf1 // movsd xmm0, qword [rcx + 8*rsi]
+ LONG $0x04580ff2; BYTE $0xf2 // addsd xmm0, qword [rdx + 8*rsi]
+ LONG $0x110f41f2; WORD $0xf004 // movsd qword [r8 + 8*rsi], xmm0
+ LONG $0x44100ff2; WORD $0x08f1 // movsd xmm0, qword [rcx + 8*rsi + 8]
+ LONG $0x44580ff2; WORD $0x08f2 // addsd xmm0, qword [rdx + 8*rsi + 8]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x08 // movsd qword [r8 + 8*rsi + 8], xmm0
+ LONG $0x44100ff2; WORD $0x10f1 // movsd xmm0, qword [rcx + 8*rsi + 16]
+ LONG $0x44580ff2; WORD $0x10f2 // addsd xmm0, qword [rdx + 8*rsi + 16]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x10 // movsd qword [r8 + 8*rsi + 16], xmm0
+ LONG $0x44100ff2; WORD $0x18f1 // movsd xmm0, qword [rcx + 8*rsi + 24]
+ LONG $0x44580ff2; WORD $0x18f2 // addsd xmm0, qword [rdx + 8*rsi + 24]
+ LONG $0x110f41f2; WORD $0xf044; BYTE $0x18 // movsd qword [r8 + 8*rsi + 24], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_364
+ JMP LBB0_697
+
+LBB0_5:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_383
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_399
+ WORD $0xf631 // xor esi, esi
+
+LBB0_408:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_410
+
+LBB0_409:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_409
+
+LBB0_410:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_411:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_411
+ JMP LBB0_697
+
+LBB0_14:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_549
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_565
+ WORD $0xf631 // xor esi, esi
+
+LBB0_574:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_576
+
+LBB0_575:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_575
+
+LBB0_576:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_577:
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_577
+ JMP LBB0_697
+
+LBB0_21:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_44
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_60
+ WORD $0xf631 // xor esi, esi
+
+LBB0_69:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_71
+
+LBB0_70:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_70
+
+LBB0_71:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_72:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_72
+ JMP LBB0_697
+
+LBB0_194:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB0_217
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_233
+ WORD $0xf631 // xor esi, esi
+
+LBB0_242:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_244
+
+LBB0_243:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_243
+
+LBB0_244:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_245:
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_245
+ JMP LBB0_697
+
+LBB0_372:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_457
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_473
+ WORD $0xf631 // xor esi, esi
+
+LBB0_482:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_484
+
+LBB0_483:
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_483
+
+LBB0_484:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_485:
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_485
+ JMP LBB0_697
+
+LBB0_538:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_623
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_639
+ WORD $0xf631 // xor esi, esi
+
+LBB0_648:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_650
+
+LBB0_649:
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_649
+
+LBB0_650:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_651:
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_651
+ JMP LBB0_697
+
+LBB0_33:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_118
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_134
+ WORD $0xf631 // xor esi, esi
+
+LBB0_143:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_145
+
+LBB0_144:
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_144
+
+LBB0_145:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_146:
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_146
+ JMP LBB0_697
+
+LBB0_206:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB0_291
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB0_697
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_307
+ WORD $0xf631 // xor esi, esi
+
+LBB0_316:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_318
+
+LBB0_317:
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_317
+
+LBB0_318:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_319:
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_319
+ JMP LBB0_697
+
+LBB0_412: // case arm: r8[i] = rdx[i] - rcx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_415 // count >= 16: take LBB0_415 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_424: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_426 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_425: // one element per iteration, rdi iterations
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_425
+
+LBB0_426: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_427: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_427
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_428: // case arm: r8[i] = rdx[i] - rcx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_431 // count >= 16: take LBB0_431 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_440: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_442 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_441: // one element per iteration, rdi iterations
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_441
+
+LBB0_442: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_443: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_443
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_578: // case arm: r8[i] = rdx[i] - rcx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_581 // count >= 16: take LBB0_581 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_590: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_592 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_591: // one element per iteration, rdi iterations
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_591
+
+LBB0_592: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_593: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_593
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_594: // case arm: r8[i] = rdx[i] - rcx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_597 // count >= 16: take LBB0_597 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_606: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_608 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_607: // one element per iteration, rdi iterations
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_607
+
+LBB0_608: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_609: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7204b70f // movzx eax, word [rdx + 2*rsi]
+ LONG $0x71042b66 // sub ax, word [rcx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7244b70f; BYTE $0x02 // movzx eax, word [rdx + 2*rsi + 2]
+ LONG $0x71442b66; BYTE $0x02 // sub ax, word [rcx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7244b70f; BYTE $0x04 // movzx eax, word [rdx + 2*rsi + 4]
+ LONG $0x71442b66; BYTE $0x04 // sub ax, word [rcx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7244b70f; BYTE $0x06 // movzx eax, word [rdx + 2*rsi + 6]
+ LONG $0x71442b66; BYTE $0x06 // sub ax, word [rcx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_609
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_73: // case arm: r8[i] = rcx[i] + rdx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_76 // count >= 16: take LBB0_76 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_85: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_87 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_86: // one element per iteration, rdi iterations
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_86
+
+LBB0_87: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_88: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_88
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_89: // case arm: r8[i] = rcx[i] + rdx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_92 // count >= 16: take LBB0_92 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_101: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_103 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_102: // one element per iteration, rdi iterations
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_102
+
+LBB0_103: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_104: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_104
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_246: // case arm: r8[i] = rcx[i] + rdx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_249 // count >= 16: take LBB0_249 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_258: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_260 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_259: // one element per iteration, rdi iterations
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_259
+
+LBB0_260: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_261: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_261
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_262: // case arm: r8[i] = rcx[i] + rdx[i] over 16-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JAE LBB0_265 // count >= 16: take LBB0_265 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_274: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_276 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_275: // one element per iteration, rdi iterations
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_275
+
+LBB0_276: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_277: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x7104b70f // movzx eax, word [rcx + 2*rsi]
+ LONG $0x72040366 // add ax, word [rdx + 2*rsi]
+ LONG $0x04894166; BYTE $0x70 // mov word [r8 + 2*rsi], ax
+ LONG $0x7144b70f; BYTE $0x02 // movzx eax, word [rcx + 2*rsi + 2]
+ LONG $0x72440366; BYTE $0x02 // add ax, word [rdx + 2*rsi + 2]
+ LONG $0x44894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], ax
+ LONG $0x7144b70f; BYTE $0x04 // movzx eax, word [rcx + 2*rsi + 4]
+ LONG $0x72440366; BYTE $0x04 // add ax, word [rdx + 2*rsi + 4]
+ LONG $0x44894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], ax
+ LONG $0x7144b70f; BYTE $0x06 // movzx eax, word [rcx + 2*rsi + 6]
+ LONG $0x72440366; BYTE $0x06 // add ax, word [rdx + 2*rsi + 6]
+ LONG $0x44894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], ax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_277
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_486: // case arm: r8[i] = rdx[i] - rcx[i] over 64-bit integer elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_489 // count >= 4: take LBB0_489 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_498: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_500 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_499: // one element per iteration, rdi iterations
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_499
+
+LBB0_500: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_501: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_501
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_502: // case arm: r8[i] = rdx[i] - rcx[i] over single-precision float elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_505 // count >= 8: take LBB0_505 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_514: // scalar path setup: rax = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_516 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_515: // one element per iteration, rdi iterations
+ LONG $0x04100ff3; BYTE $0xb2 // movss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045c0ff3; BYTE $0xb1 // subss xmm0, dword [rcx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_515
+
+LBB0_516: // rax < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_517: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x04100ff3; BYTE $0xb2 // movss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045c0ff3; BYTE $0xb1 // subss xmm0, dword [rcx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x44100ff3; WORD $0x04b2 // movss xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x445c0ff3; WORD $0x04b1 // subss xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x04 // movss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x44100ff3; WORD $0x08b2 // movss xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x445c0ff3; WORD $0x08b1 // subss xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x08 // movss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x44100ff3; WORD $0x0cb2 // movss xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x445c0ff3; WORD $0x0cb1 // subss xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x0c // movss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_517
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_652: // case arm: r8[i] = rdx[i] - rcx[i] over 64-bit integer elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_655 // count >= 4: take LBB0_655 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_664: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_666 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_665: // one element per iteration, rdi iterations
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_665
+
+LBB0_666: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_667: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0xf2048b48 // mov rax, qword [rdx + 8*rsi]
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf2448b48; BYTE $0x08 // mov rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf2448b48; BYTE $0x10 // mov rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf2448b48; BYTE $0x18 // mov rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_667
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_668: // case arm: r8[i] = rdx[i] - rcx[i] over single-precision float elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_671 // count >= 8: take LBB0_671 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_680: // scalar path setup: rax = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_682 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_681: // one element per iteration, rdi iterations
+ LONG $0x04100ff3; BYTE $0xb2 // movss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045c0ff3; BYTE $0xb1 // subss xmm0, dword [rcx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_681
+
+LBB0_682: // rax < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_683: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x04100ff3; BYTE $0xb2 // movss xmm0, dword [rdx + 4*rsi]
+ LONG $0x045c0ff3; BYTE $0xb1 // subss xmm0, dword [rcx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x44100ff3; WORD $0x04b2 // movss xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x445c0ff3; WORD $0x04b1 // subss xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x04 // movss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x44100ff3; WORD $0x08b2 // movss xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x445c0ff3; WORD $0x08b1 // subss xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x08 // movss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x44100ff3; WORD $0x0cb2 // movss xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x445c0ff3; WORD $0x0cb1 // subss xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x0c // movss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_683
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_147: // case arm: r8[i] = rcx[i] + rdx[i] over 64-bit integer elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_150 // count >= 4: take LBB0_150 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_159: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_161 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_160: // one element per iteration, rdi iterations
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_160
+
+LBB0_161: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_162: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_162
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_163: // case arm: r8[i] = rcx[i] + rdx[i] over single-precision float elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_166 // count >= 8: take LBB0_166 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_175: // scalar path setup: rax = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_177 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_176: // one element per iteration, rdi iterations
+ LONG $0x04100ff3; BYTE $0xb1 // movss xmm0, dword [rcx + 4*rsi]
+ LONG $0x04580ff3; BYTE $0xb2 // addss xmm0, dword [rdx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_176
+
+LBB0_177: // rax < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_178: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x04100ff3; BYTE $0xb1 // movss xmm0, dword [rcx + 4*rsi]
+ LONG $0x04580ff3; BYTE $0xb2 // addss xmm0, dword [rdx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x44100ff3; WORD $0x04b1 // movss xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x44580ff3; WORD $0x04b2 // addss xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x04 // movss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x44100ff3; WORD $0x08b1 // movss xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x44580ff3; WORD $0x08b2 // addss xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x08 // movss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x44100ff3; WORD $0x0cb1 // movss xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x44580ff3; WORD $0x0cb2 // addss xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x0c // movss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_178
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_320: // case arm: r8[i] = rcx[i] + rdx[i] over 64-bit integer elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JAE LBB0_323 // count >= 4: take LBB0_323 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_332: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_334 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_333: // one element per iteration, rdi iterations
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_333
+
+LBB0_334: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_335: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0xf1048b48 // mov rax, qword [rcx + 8*rsi]
+ LONG $0xf2040348 // add rax, qword [rdx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0xf1448b48; BYTE $0x08 // mov rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf2440348; BYTE $0x08 // add rax, qword [rdx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ LONG $0xf1448b48; BYTE $0x10 // mov rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf2440348; BYTE $0x10 // add rax, qword [rdx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ LONG $0xf1448b48; BYTE $0x18 // mov rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf2440348; BYTE $0x18 // add rax, qword [rdx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_335
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_336: // case arm: r8[i] = rcx[i] + rdx[i] over single-precision float elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_339 // count >= 8: take LBB0_339 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_348: // scalar path setup: rax = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8948; BYTE $0xf0 // mov rax, rsi
+ WORD $0xf748; BYTE $0xd0 // not rax
+ WORD $0x014c; BYTE $0xd0 // add rax, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_350 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_349: // one element per iteration, rdi iterations
+ LONG $0x04100ff3; BYTE $0xb1 // movss xmm0, dword [rcx + 4*rsi]
+ LONG $0x04580ff3; BYTE $0xb2 // addss xmm0, dword [rdx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_349
+
+LBB0_350: // rax < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f88348 // cmp rax, 3
+ JB LBB0_697
+
+LBB0_351: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x04100ff3; BYTE $0xb1 // movss xmm0, dword [rcx + 4*rsi]
+ LONG $0x04580ff3; BYTE $0xb2 // addss xmm0, dword [rdx + 4*rsi]
+ LONG $0x110f41f3; WORD $0xb004 // movss dword [r8 + 4*rsi], xmm0
+ LONG $0x44100ff3; WORD $0x04b1 // movss xmm0, dword [rcx + 4*rsi + 4]
+ LONG $0x44580ff3; WORD $0x04b2 // addss xmm0, dword [rdx + 4*rsi + 4]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x04 // movss dword [r8 + 4*rsi + 4], xmm0
+ LONG $0x44100ff3; WORD $0x08b1 // movss xmm0, dword [rcx + 4*rsi + 8]
+ LONG $0x44580ff3; WORD $0x08b2 // addss xmm0, dword [rdx + 4*rsi + 8]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x08 // movss dword [r8 + 4*rsi + 8], xmm0
+ LONG $0x44100ff3; WORD $0x0cb1 // movss xmm0, dword [rcx + 4*rsi + 12]
+ LONG $0x44580ff3; WORD $0x0cb2 // addss xmm0, dword [rdx + 4*rsi + 12]
+ LONG $0x110f41f3; WORD $0xb044; BYTE $0x0c // movss dword [r8 + 4*rsi + 12], xmm0
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_351
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_383: // case arm: r8[i] = rdx[i] - rcx[i] over 8-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_386 // count >= 32: take LBB0_386 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_395: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_397 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_396: // one element per iteration, rdi iterations
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_396
+
+LBB0_397: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_398: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_398
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_549: // case arm: r8[i] = rdx[i] - rcx[i] over 8-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_552 // count >= 32: take LBB0_552 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_561: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_563 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_562: // one element per iteration, rdi iterations
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_562
+
+LBB0_563: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_564: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x3204b60f // movzx eax, byte [rdx + rsi]
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3244b60f; BYTE $0x01 // movzx eax, byte [rdx + rsi + 1]
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3244b60f; BYTE $0x02 // movzx eax, byte [rdx + rsi + 2]
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3244b60f; BYTE $0x03 // movzx eax, byte [rdx + rsi + 3]
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_564
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_44: // case arm: r8[i] = rcx[i] + rdx[i] over 8-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_47 // count >= 32: take LBB0_47 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_56: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_58 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_57: // one element per iteration, rdi iterations
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_57
+
+LBB0_58: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_59: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_59
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_217: // case arm: r8[i] = rcx[i] + rdx[i] over 8-bit elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JAE LBB0_220 // count >= 32: take LBB0_220 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_229: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_231 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_230: // one element per iteration, rdi iterations
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_230
+
+LBB0_231: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_232: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ LONG $0x3104b60f // movzx eax, byte [rcx + rsi]
+ WORD $0x0402; BYTE $0x32 // add al, byte [rdx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x3144b60f; BYTE $0x01 // movzx eax, byte [rcx + rsi + 1]
+ LONG $0x01324402 // add al, byte [rdx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ LONG $0x3144b60f; BYTE $0x02 // movzx eax, byte [rcx + rsi + 2]
+ LONG $0x02324402 // add al, byte [rdx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ LONG $0x3144b60f; BYTE $0x03 // movzx eax, byte [rcx + rsi + 3]
+ LONG $0x03324402 // add al, byte [rdx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_232
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_457: // case arm: r8[i] = rdx[i] - rcx[i] over 32-bit integer elements; r9d holds the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): exit via LBB0_697
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_460 // count >= 8: take LBB0_460 (not in this excerpt; presumably the SIMD path - confirm)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_469: // scalar path setup: r9 = r10 - rsi - 1, rdi = r10 & 3
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_471 // (count & 3) == 0: skip the one-at-a-time loop
+
+LBB0_470: // one element per iteration, rdi iterations
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_470
+
+LBB0_471: // r9 < 3 (unsigned): skip the unrolled loop and exit
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697
+
+LBB0_472: // 4x-unrolled loop: four elements per iteration until rsi == r10
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_472
+ JMP LBB0_697 // arm done; LBB0_697 is defined outside this excerpt
+
+LBB0_623: // 32-bit element-wise subtract: out[i] = b[i] - a[i] with a=rcx, b=rdx, out=r8; r9d is the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): nothing to process
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_626 // 8 or more elements: branch to LBB0_626 (outside this section; presumably the vector path)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_635: // scalar path: rsi is the index of the next element, r10 the total count
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_637 // count & 3 == 0: no remainder passes needed
+
+LBB0_636: // remainder loop: one element per pass, rdi = count & 3 passes
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_636
+
+LBB0_637: // r9 = r10 - rsi - 1 as computed at LBB0_635 (elements left on entry there, minus one)
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697 // r9 < 3 (unsigned): only 1..3 elements were left on entry, so skip the unrolled loop
+
+LBB0_638: // 4x unrolled loop; exits when rsi equals the count in r10
+ WORD $0x048b; BYTE $0xb2 // mov eax, dword [rdx + 4*rsi]
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b2448b // mov eax, dword [rdx + 4*rsi + 4]
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b2448b // mov eax, dword [rdx + 4*rsi + 8]
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb2448b // mov eax, dword [rdx + 4*rsi + 12]
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_638
+ JMP LBB0_697 // done; LBB0_697 is outside this section (presumably the shared exit)
+
+LBB0_118: // 32-bit element-wise add: out[i] = a[i] + b[i] with a=rcx, b=rdx, out=r8; r9d is the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): nothing to process
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_121 // 8 or more elements: branch to LBB0_121 (outside this section; presumably the vector path)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_130: // scalar path: rsi is the index of the next element, r10 the total count
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_132 // count & 3 == 0: no remainder passes needed
+
+LBB0_131: // remainder loop: one element per pass, rdi = count & 3 passes
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_131
+
+LBB0_132: // r9 = r10 - rsi - 1 as computed at LBB0_130 (elements left on entry there, minus one)
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697 // r9 < 3 (unsigned): only 1..3 elements were left on entry, so skip the unrolled loop
+
+LBB0_133: // 4x unrolled loop; exits when rsi equals the count in r10
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_133
+ JMP LBB0_697 // done; LBB0_697 is outside this section (presumably the shared exit)
+
+LBB0_291: // 32-bit element-wise add: out[i] = a[i] + b[i] with a=rcx, b=rdx, out=r8; r9d is the element count
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB0_697 // count <= 0 (signed): nothing to process
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JAE LBB0_294 // 8 or more elements: branch to LBB0_294 (outside this section; presumably the vector path)
+ WORD $0xf631 // xor esi, esi
+
+LBB0_303: // scalar path: rsi is the index of the next element, r10 the total count
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB0_305 // count & 3 == 0: no remainder passes needed
+
+LBB0_304: // remainder loop: one element per pass, rdi = count & 3 passes
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB0_304
+
+LBB0_305: // r9 = r10 - rsi - 1 as computed at LBB0_303 (elements left on entry there, minus one)
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB0_697 // r9 < 3 (unsigned): only 1..3 elements were left on entry, so skip the unrolled loop
+
+LBB0_306: // 4x unrolled loop; exits when rsi equals the count in r10
+ WORD $0x048b; BYTE $0xb1 // mov eax, dword [rcx + 4*rsi]
+ WORD $0x0403; BYTE $0xb2 // add eax, dword [rdx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x04b1448b // mov eax, dword [rcx + 4*rsi + 4]
+ LONG $0x04b24403 // add eax, dword [rdx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ LONG $0x08b1448b // mov eax, dword [rcx + 4*rsi + 8]
+ LONG $0x08b24403 // add eax, dword [rdx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ LONG $0x0cb1448b // mov eax, dword [rcx + 4*rsi + 12]
+ LONG $0x0cb24403 // add eax, dword [rdx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB0_306
+ JMP LBB0_697 // done; LBB0_697 is outside this section (presumably the shared exit)
+
+LBB0_444: // SSE path for 32-bit subtract, out = b - a with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_453 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_453 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_453 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_447 // rsi == 8, i.e. a single 8-element chunk: skip the paired loop (LBB0_447 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_449: // two 8-element chunks (16 dwords) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_449
+ JMP LBB0_450 // LBB0_450 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_610: // SSE path for 32-bit subtract, out = b - a with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_619 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_619 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_619 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_613 // rsi == 8, i.e. a single 8-element chunk: skip the paired loop (LBB0_613 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_615: // two 8-element chunks (16 dwords) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_615
+ JMP LBB0_616 // LBB0_616 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_105: // SSE path for 32-bit add, out = a + b with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_114 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_114 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_114 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_108 // rsi == 8, i.e. a single 8-element chunk: skip the paired loop (LBB0_108 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_110: // two 8-element chunks (16 dwords) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30b9 // movdqu xmm0, oword [rcx + 4*rdi + 48]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm0
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_110
+ JMP LBB0_111 // LBB0_111 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_278: // SSE path for 32-bit add, out = a + b with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_287 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_287 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_287 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_281 // rsi == 8, i.e. a single 8-element chunk: skip the paired loop (LBB0_281 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_283: // two 8-element chunks (16 dwords) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30b9 // movdqu xmm0, oword [rcx + 4*rdi + 48]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm0
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_283
+ JMP LBB0_284 // LBB0_284 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_518: // SSE path for double-precision subtract, out = b - a with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_527 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_527 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_527 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_521 // rsi == 4, i.e. a single 4-element chunk: skip the paired loop (LBB0_521 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_523: // two 4-element chunks (8 doubles) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xc25c0f66 // subpd xmm0, xmm2
+ LONG $0x54100f66; WORD $0x10f9 // movupd xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xca5c0f66 // subpd xmm1, xmm2
+ LONG $0x110f4166; WORD $0xf804 // movupd oword [r8 + 8*rdi], xmm0
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x44100f66; WORD $0x20fa // movupd xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c100f66; WORD $0x30fa // movupd xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xc25c0f66 // subpd xmm0, xmm2
+ LONG $0x54100f66; WORD $0x30f9 // movupd xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xca5c0f66 // subpd xmm1, xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm0
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_523
+ JMP LBB0_524 // LBB0_524 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_684: // SSE path for double-precision subtract, out = b - a with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_693 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_693 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_693 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_687 // rsi == 4, i.e. a single 4-element chunk: skip the paired loop (LBB0_687 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_689: // two 4-element chunks (8 doubles) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xc25c0f66 // subpd xmm0, xmm2
+ LONG $0x54100f66; WORD $0x10f9 // movupd xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xca5c0f66 // subpd xmm1, xmm2
+ LONG $0x110f4166; WORD $0xf804 // movupd oword [r8 + 8*rdi], xmm0
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x44100f66; WORD $0x20fa // movupd xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c100f66; WORD $0x30fa // movupd xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xc25c0f66 // subpd xmm0, xmm2
+ LONG $0x54100f66; WORD $0x30f9 // movupd xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xca5c0f66 // subpd xmm1, xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm0
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_689
+ JMP LBB0_690 // LBB0_690 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_179: // SSE path for double-precision add, out = a + b with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_188 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_188 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_188 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_182 // rsi == 4, i.e. a single 4-element chunk: skip the paired loop (LBB0_182 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_184: // two 4-element chunks (8 doubles) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0580f66 // addpd xmm2, xmm0
+ LONG $0x44100f66; WORD $0x10f9 // movupd xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1580f66 // addpd xmm0, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm0
+ LONG $0x44100f66; WORD $0x20fa // movupd xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c100f66; WORD $0x30fa // movupd xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xd0580f66 // addpd xmm2, xmm0
+ LONG $0x44100f66; WORD $0x30f9 // movupd xmm0, oword [rcx + 8*rdi + 48]
+ LONG $0xc1580f66 // addpd xmm0, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm0
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_184
+ JMP LBB0_185 // LBB0_185 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_352: // SSE path for double-precision add, out = a + b with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_361 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_361 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_361 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_355 // rsi == 4, i.e. a single 4-element chunk: skip the paired loop (LBB0_355 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_357: // two 4-element chunks (8 doubles) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0580f66 // addpd xmm2, xmm0
+ LONG $0x44100f66; WORD $0x10f9 // movupd xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1580f66 // addpd xmm0, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm0
+ LONG $0x44100f66; WORD $0x20fa // movupd xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c100f66; WORD $0x30fa // movupd xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xd0580f66 // addpd xmm2, xmm0
+ LONG $0x44100f66; WORD $0x30f9 // movupd xmm0, oword [rcx + 8*rdi + 48]
+ LONG $0xc1580f66 // addpd xmm0, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm0
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_357
+ JMP LBB0_358 // LBB0_358 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_399: // SSE path for 8-bit subtract, out = b - a with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_408 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_408 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_408 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_402 // rsi == 32, i.e. a single 32-element chunk: skip the paired loop (LBB0_402 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_404: // two 32-element chunks (64 bytes) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_404
+ JMP LBB0_405 // LBB0_405 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_565: // SSE path for 8-bit subtract, out = b - a with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_574 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_574 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_574 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_568 // rsi == 32, i.e. a single 32-element chunk: skip the paired loop (LBB0_568 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_570: // two 32-element chunks (64 bytes) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_570
+ JMP LBB0_571 // LBB0_571 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_60: // SSE path for 8-bit add, out = a + b with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_69 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_69 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_69 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_63 // rsi == 32, i.e. a single 32-element chunk: skip the paired loop (LBB0_63 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_65: // two 32-element chunks (64 bytes) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3039 // movdqu xmm0, oword [rcx + rdi + 48]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm0
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_65
+ JMP LBB0_66 // LBB0_66 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_233: // SSE path for 8-bit add, out = a + b with a=rcx, b=rdx, out=r8; r10 is used as the element count. Starts with an overlap check.
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_242 // out range overlaps the rdx input range: leave with rsi = 0 (LBB0_242 is outside this section; presumably the scalar fallback)
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_242 // out range overlaps the rcx input range: same exit
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_236 // rsi == 32, i.e. a single 32-element chunk: skip the paired loop (LBB0_236 is outside this section)
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_238: // two 32-element chunks (64 bytes) per pass; rdi is the element index, rax counts up by 2 from -(r9 & -2) to zero
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3039 // movdqu xmm0, oword [rcx + rdi + 48]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm0
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_238
+ JMP LBB0_239 // LBB0_239 is outside this section; presumably handles an odd final chunk and the scalar tail
+
+LBB0_473:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_482
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_482
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_476
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_478:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_478
+ JMP LBB0_479
+
+LBB0_639:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_648
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_648
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_642
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_644:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_644
+ JMP LBB0_645
+
+LBB0_134:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_143
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_143
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_137
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_139:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30f9 // movdqu xmm0, oword [rcx + 8*rdi + 48]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm0
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_139
+ JMP LBB0_140
+
+LBB0_307:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_316
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_316
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_310
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_312:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30f9 // movdqu xmm0, oword [rcx + 8*rdi + 48]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm0
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_312
+ JMP LBB0_313
+
+LBB0_415:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_424
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_424
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_418
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_420:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_420
+ JMP LBB0_421
+
+LBB0_431:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_440
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_440
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_434
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_436:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_436
+ JMP LBB0_437
+
+LBB0_581:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_590
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_590
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_584
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_586:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_586
+ JMP LBB0_587
+
+LBB0_597:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_606
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_606
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_600
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_602:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_602
+ JMP LBB0_603
+
+LBB0_76:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_85
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_85
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_79
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_81:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3079 // movdqu xmm0, oword [rcx + 2*rdi + 48]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm0
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_81
+ JMP LBB0_82
+
+LBB0_92:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_101
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_101
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_95
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_97:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3079 // movdqu xmm0, oword [rcx + 2*rdi + 48]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm0
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_97
+ JMP LBB0_98
+
+LBB0_249:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_258
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_258
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_252
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_254:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3079 // movdqu xmm0, oword [rcx + 2*rdi + 48]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm0
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_254
+ JMP LBB0_255
+
+LBB0_265:
+ LONG $0x50348d4b // lea rsi, [r8 + 2*r10]
+ LONG $0x52048d4a // lea rax, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x51048d4a // lea rax, [rcx + 2*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_274
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_274
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xf0468d48 // lea rax, [rsi - 16]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_268
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_270:
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x207a // movdqu xmm0, oword [rdx + 2*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x307a // movdqu xmm1, oword [rdx + 2*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2079 // movdqu xmm2, oword [rcx + 2*rdi + 32]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3079 // movdqu xmm0, oword [rcx + 2*rdi + 48]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm0
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_270
+ JMP LBB0_271
+
+LBB0_489:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_498
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_498
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_492
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_494:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_494
+ JMP LBB0_495
+
+LBB0_505:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_514
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_514
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_508
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_510:
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x5c0f; BYTE $0xc2 // subps xmm0, xmm2
+ LONG $0xb954100f; BYTE $0x10 // movups xmm2, oword [rcx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xca // subps xmm1, xmm2
+ LONG $0x04110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm0
+ LONG $0x4c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm1
+ LONG $0xba44100f; BYTE $0x20 // movups xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0xba4c100f; BYTE $0x30 // movups xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ WORD $0x5c0f; BYTE $0xc2 // subps xmm0, xmm2
+ LONG $0xb954100f; BYTE $0x30 // movups xmm2, oword [rcx + 4*rdi + 48]
+ WORD $0x5c0f; BYTE $0xca // subps xmm1, xmm2
+ LONG $0x44110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm0
+ LONG $0x4c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_510
+ JMP LBB0_511
+
+LBB0_655:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_664
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_664
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_658
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_660:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_660
+ JMP LBB0_661
+
+LBB0_671:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_680
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_680
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_674
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_676:
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x5c0f; BYTE $0xc2 // subps xmm0, xmm2
+ LONG $0xb954100f; BYTE $0x10 // movups xmm2, oword [rcx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xca // subps xmm1, xmm2
+ LONG $0x04110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm0
+ LONG $0x4c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm1
+ LONG $0xba44100f; BYTE $0x20 // movups xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0xba4c100f; BYTE $0x30 // movups xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ WORD $0x5c0f; BYTE $0xc2 // subps xmm0, xmm2
+ LONG $0xb954100f; BYTE $0x30 // movups xmm2, oword [rcx + 4*rdi + 48]
+ WORD $0x5c0f; BYTE $0xca // subps xmm1, xmm2
+ LONG $0x44110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm0
+ LONG $0x4c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_676
+ JMP LBB0_677
+
+LBB0_150:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_159
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_159
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_153
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_155:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30f9 // movdqu xmm0, oword [rcx + 8*rdi + 48]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm0
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_155
+ JMP LBB0_156
+
+LBB0_166:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_175
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_175
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_169
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_171:
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x580f; BYTE $0xd0 // addps xmm2, xmm0
+ LONG $0xb944100f; BYTE $0x10 // movups xmm0, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xc1 // addps xmm0, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x44110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm0
+ LONG $0xba44100f; BYTE $0x20 // movups xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0xba4c100f; BYTE $0x30 // movups xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ WORD $0x580f; BYTE $0xd0 // addps xmm2, xmm0
+ LONG $0xb944100f; BYTE $0x30 // movups xmm0, oword [rcx + 4*rdi + 48]
+ WORD $0x580f; BYTE $0xc1 // addps xmm0, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x44110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm0
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_171
+ JMP LBB0_172
+
+LBB0_323:
+ LONG $0xd0348d4b // lea rsi, [r8 + 8*r10]
+ LONG $0xd2048d4a // lea rax, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0xd1048d4a // lea rax, [rcx + 8*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_332
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_332
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0xfc468d48 // lea rax, [rsi - 4]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_326
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_328:
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20fa // movdqu xmm0, oword [rdx + 8*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30fa // movdqu xmm1, oword [rdx + 8*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20f9 // movdqu xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30f9 // movdqu xmm0, oword [rcx + 8*rdi + 48]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm0
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_328
+ JMP LBB0_329
+
+LBB0_339:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_348
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_348
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_342
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_344:
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x580f; BYTE $0xd0 // addps xmm2, xmm0
+ LONG $0xb944100f; BYTE $0x10 // movups xmm0, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xc1 // addps xmm0, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x44110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm0
+ LONG $0xba44100f; BYTE $0x20 // movups xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0xba4c100f; BYTE $0x30 // movups xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ WORD $0x580f; BYTE $0xd0 // addps xmm2, xmm0
+ LONG $0xb944100f; BYTE $0x30 // movups xmm0, oword [rcx + 4*rdi + 48]
+ WORD $0x580f; BYTE $0xc1 // addps xmm0, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x44110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm0
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_344
+ JMP LBB0_345
+
+LBB0_386:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_395
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_395
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_389
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_391:
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_391
+ JMP LBB0_392
+
+LBB0_552:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_561
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_561
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_555
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_557:
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_557
+ JMP LBB0_558
+
+LBB0_47:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_56
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_56
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_50
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_52:
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3039 // movdqu xmm0, oword [rcx + rdi + 48]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm0
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_52
+ JMP LBB0_53
+
+LBB0_220:
+ LONG $0x10348d4b // lea rsi, [r8 + r10]
+ LONG $0x12048d4a // lea rax, [rdx + r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x11048d4a // lea rax, [rcx + r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_229
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_229
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xe0468d48 // lea rax, [rsi - 32]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_223
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_225:
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x203a // movdqu xmm0, oword [rdx + rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x303a // movdqu xmm1, oword [rdx + rdi + 48]
+ LONG $0x546f0ff3; WORD $0x2039 // movdqu xmm2, oword [rcx + rdi + 32]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x3039 // movdqu xmm0, oword [rcx + rdi + 48]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm0
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_225
+ JMP LBB0_226
+
+LBB0_460:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_469
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_469
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_463
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_465:
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_465
+ JMP LBB0_466
+
+LBB0_626:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_635
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_635
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_629
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_631:
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_631
+ JMP LBB0_632
+
+LBB0_121:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_130
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_130
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_124
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_126:
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30b9 // movdqu xmm0, oword [rcx + 4*rdi + 48]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm0
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_126
+ JMP LBB0_127
+
+LBB0_294:
+ LONG $0x90348d4b // lea rsi, [r8 + 4*r10]
+ LONG $0x92048d4a // lea rax, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ LONG $0xd1970f41 // seta r9b
+ LONG $0x91048d4a // lea rax, [rcx + 4*r10]
+ WORD $0x3948; BYTE $0xd6 // cmp rsi, rdx
+ LONG $0xd3970f41 // seta r11b
+ WORD $0x394c; BYTE $0xc0 // cmp rax, r8
+ WORD $0x970f; BYTE $0xd0 // seta al
+ WORD $0x3948; BYTE $0xce // cmp rsi, rcx
+ LONG $0xd7970f40 // seta dil
+ WORD $0xf631 // xor esi, esi
+ WORD $0x8445; BYTE $0xd9 // test r9b, r11b
+ JNE LBB0_303
+ WORD $0x2040; BYTE $0xf8 // and al, dil
+ JNE LBB0_303
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xf8468d48 // lea rax, [rsi - 8]
+ WORD $0x8949; BYTE $0xc1 // mov r9, rax
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc0 // test rax, rax
+ JE LBB0_297
+ WORD $0x894c; BYTE $0xc8 // mov rax, r9
+ LONG $0xfee08348 // and rax, -2
+ WORD $0xf748; BYTE $0xd8 // neg rax
+ WORD $0xff31 // xor edi, edi
+
+LBB0_299:
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+ LONG $0x446f0ff3; WORD $0x20ba // movdqu xmm0, oword [rdx + 4*rdi + 32]
+ LONG $0x4c6f0ff3; WORD $0x30ba // movdqu xmm1, oword [rdx + 4*rdi + 48]
+ LONG $0x546f0ff3; WORD $0x20b9 // movdqu xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x30b9 // movdqu xmm0, oword [rcx + 4*rdi + 48]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm0
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c08348 // add rax, 2
+ JNE LBB0_299
+ JMP LBB0_300
+
+LBB0_447:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_450:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_452
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+
+LBB0_452:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_453
+ JMP LBB0_697
+
+LBB0_613:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_616:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_618
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+
+LBB0_618:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_619
+ JMP LBB0_697
+
+LBB0_108:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_111:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_113
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB0_113:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_114
+ JMP LBB0_697
+
+LBB0_281:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_284:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_286
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB0_286:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_287
+ JMP LBB0_697
+
+LBB0_521:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_524:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_526
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xc25c0f66 // subpd xmm0, xmm2
+ LONG $0x54100f66; WORD $0x10f9 // movupd xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xca5c0f66 // subpd xmm1, xmm2
+ LONG $0x110f4166; WORD $0xf804 // movupd oword [r8 + 8*rdi], xmm0
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm1
+
+LBB0_526:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_527
+ JMP LBB0_697
+
+LBB0_687:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_690:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_692
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xc25c0f66 // subpd xmm0, xmm2
+ LONG $0x54100f66; WORD $0x10f9 // movupd xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xca5c0f66 // subpd xmm1, xmm2
+ LONG $0x110f4166; WORD $0xf804 // movupd oword [r8 + 8*rdi], xmm0
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm1
+
+LBB0_692:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_693
+ JMP LBB0_697
+
+LBB0_182:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_185:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_187
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0580f66 // addpd xmm2, xmm0
+ LONG $0x44100f66; WORD $0x10f9 // movupd xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1580f66 // addpd xmm0, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm0
+
+LBB0_187:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_188
+ JMP LBB0_697
+
+LBB0_355:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_358:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_360
+ LONG $0x04100f66; BYTE $0xfa // movupd xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c100f66; WORD $0x10fa // movupd xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0580f66 // addpd xmm2, xmm0
+ LONG $0x44100f66; WORD $0x10f9 // movupd xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1580f66 // addpd xmm0, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf844; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm0
+
+LBB0_360:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_361
+ JMP LBB0_697
+
+LBB0_402:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_405:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_407
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+
+LBB0_407:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_408
+ JMP LBB0_697
+
+LBB0_568:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_571:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_573
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+
+LBB0_573:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_574
+ JMP LBB0_697
+
+LBB0_63:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_66:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_68
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB0_68:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_69
+ JMP LBB0_697
+
+LBB0_236:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_239:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_241
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB0_241:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_242
+ JMP LBB0_697
+
+LBB0_476:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_479:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_481
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+
+LBB0_481:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_482
+ JMP LBB0_697
+
+LBB0_642:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_645:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_647
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+
+LBB0_647:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_648
+ JMP LBB0_697
+
+LBB0_137:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_140:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_142
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB0_142:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_143
+ JMP LBB0_697
+
+LBB0_310:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_313:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_315
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB0_315:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_316
+ JMP LBB0_697
+
+LBB0_418:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_421:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_423
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+
+LBB0_423:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_424
+ JMP LBB0_697
+
+LBB0_434:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_437:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_439
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+
+LBB0_439:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_440
+ JMP LBB0_697
+
+LBB0_584:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_587:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_589
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+
+LBB0_589:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_590
+ JMP LBB0_697
+
+LBB0_600:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_603:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_605
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x7804 // movdqu oword [r8 + 2*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+
+LBB0_605:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_606
+ JMP LBB0_697
+
+LBB0_79:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_82:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_84
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB0_84:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_85
+ JMP LBB0_697
+
+LBB0_95:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_98:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_100
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB0_100:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_101
+ JMP LBB0_697
+
+LBB0_252:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_255:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_257
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB0_257:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_258
+ JMP LBB0_697
+
+LBB0_268:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_271:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_273
+ LONG $0x046f0ff3; BYTE $0x7a // movdqu xmm0, oword [rdx + 2*rdi]
+ LONG $0x4c6f0ff3; WORD $0x107a // movdqu xmm1, oword [rdx + 2*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x79 // movdqu xmm2, oword [rcx + 2*rdi]
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1079 // movdqu xmm0, oword [rcx + 2*rdi + 16]
+ LONG $0xc1fd0f66 // paddw xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x7814 // movdqu oword [r8 + 2*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB0_273:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_274
+ JMP LBB0_697
+
+LBB0_492:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_495:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_497
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+
+LBB0_497:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_498
+ JMP LBB0_697
+
+LBB0_508:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_511:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_513
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x5c0f; BYTE $0xc2 // subps xmm0, xmm2
+ LONG $0xb954100f; BYTE $0x10 // movups xmm2, oword [rcx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xca // subps xmm1, xmm2
+ LONG $0x04110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm0
+ LONG $0x4c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm1
+
+LBB0_513:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_514
+ JMP LBB0_697
+
+LBB0_658:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_661:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_663
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf804 // movdqu oword [r8 + 8*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+
+LBB0_663:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_664
+ JMP LBB0_697
+
+LBB0_674:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_677:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_679
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x5c0f; BYTE $0xc2 // subps xmm0, xmm2
+ LONG $0xb954100f; BYTE $0x10 // movups xmm2, oword [rcx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xca // subps xmm1, xmm2
+ LONG $0x04110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm0
+ LONG $0x4c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm1
+
+LBB0_679:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_680
+ JMP LBB0_697
+
+LBB0_153:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_156:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_158
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB0_158:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_159
+ JMP LBB0_697
+
+LBB0_169:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_172:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_174
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x580f; BYTE $0xd0 // addps xmm2, xmm0
+ LONG $0xb944100f; BYTE $0x10 // movups xmm0, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xc1 // addps xmm0, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x44110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm0
+
+LBB0_174:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_175
+ JMP LBB0_697
+
+LBB0_326:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_329:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_331
+ LONG $0x046f0ff3; BYTE $0xfa // movdqu xmm0, oword [rdx + 8*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10fa // movdqu xmm1, oword [rdx + 8*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xf9 // movdqu xmm2, oword [rcx + 8*rdi]
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10f9 // movdqu xmm0, oword [rcx + 8*rdi + 16]
+ LONG $0xc1d40f66 // paddq xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xf814 // movdqu oword [r8 + 8*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB0_331:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_332
+ JMP LBB0_697
+
+LBB0_342:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_345:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_347
+ LONG $0xba04100f // movups xmm0, oword [rdx + 4*rdi]
+ LONG $0xba4c100f; BYTE $0x10 // movups xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ WORD $0x580f; BYTE $0xd0 // addps xmm2, xmm0
+ LONG $0xb944100f; BYTE $0x10 // movups xmm0, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xc1 // addps xmm0, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x44110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm0
+
+LBB0_347:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_348
+ JMP LBB0_697
+
+LBB0_389:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_392:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_394
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+
+LBB0_394:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_395
+ JMP LBB0_697
+
+LBB0_555:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_558:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_560
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x3804 // movdqu oword [r8 + rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+
+LBB0_560:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_561
+ JMP LBB0_697
+
+LBB0_50:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_53:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_55
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB0_55:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_56
+ JMP LBB0_697
+
+LBB0_223:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_226:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_228
+ LONG $0x046f0ff3; BYTE $0x3a // movdqu xmm0, oword [rdx + rdi]
+ LONG $0x4c6f0ff3; WORD $0x103a // movdqu xmm1, oword [rdx + rdi + 16]
+ LONG $0x146f0ff3; BYTE $0x39 // movdqu xmm2, oword [rcx + rdi]
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x1039 // movdqu xmm0, oword [rcx + rdi + 16]
+ LONG $0xc1fc0f66 // paddb xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0x3814 // movdqu oword [r8 + rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB0_228:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_229
+ JMP LBB0_697
+
+LBB0_463:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_466:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_468
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+
+LBB0_468:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_469
+ JMP LBB0_697
+
+LBB0_629:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_632:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_634
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb804 // movdqu oword [r8 + 4*rdi], xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+
+LBB0_634:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_635
+ JMP LBB0_697
+
+LBB0_124:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_127:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_129
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB0_129:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_130
+ JMP LBB0_697
+
+LBB0_297:
+ WORD $0xff31 // xor edi, edi
+
+LBB0_300:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB0_302
+ LONG $0x046f0ff3; BYTE $0xba // movdqu xmm0, oword [rdx + 4*rdi]
+ LONG $0x4c6f0ff3; WORD $0x10ba // movdqu xmm1, oword [rdx + 4*rdi + 16]
+ LONG $0x146f0ff3; BYTE $0xb9 // movdqu xmm2, oword [rcx + 4*rdi]
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x446f0ff3; WORD $0x10b9 // movdqu xmm0, oword [rcx + 4*rdi + 16]
+ LONG $0xc1fe0f66 // paddd xmm0, xmm1
+ LONG $0x7f0f41f3; WORD $0xb814 // movdqu oword [r8 + 4*rdi], xmm2
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB0_302:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB0_303
+
+LBB0_697:
+ RET
+
+TEXT ·_arithmetic_arr_scalar_sse4(SB), $0-48
+
+ MOVQ typ+0(FP), DI
+ MOVQ op+8(FP), SI
+ MOVQ inLeft+16(FP), DX
+ MOVQ inRight+24(FP), CX
+ MOVQ out+32(FP), R8
+ MOVQ len+40(FP), R9
+
+ LONG $0x01fe8040 // cmp sil, 1
+ JG LBB1_11
+ WORD $0x8440; BYTE $0xf6 // test sil, sil
+ JE LBB1_21
+ LONG $0x01fe8040 // cmp sil, 1
+ JNE LBB1_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_37
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_65
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_105
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_108
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_10
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_297
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_297
+
+LBB1_10:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_421:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_423
+
+LBB1_422:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_422
+
+LBB1_423:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_424:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_424
+ JMP LBB1_737
+
+LBB1_11:
+ LONG $0x02fe8040 // cmp sil, 2
+ JE LBB1_29
+ LONG $0x03fe8040 // cmp sil, 3
+ JNE LBB1_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_44
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_70
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_111
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_114
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_20
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_300
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_300
+
+LBB1_20:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_429:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_431
+
+LBB1_430:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_430
+
+LBB1_431:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_432:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_432
+ JMP LBB1_737
+
+LBB1_21:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_51
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_75
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_117
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_120
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_28
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_303
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_303
+
+LBB1_28:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_437:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_439
+
+LBB1_438:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_438
+
+LBB1_439:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_440:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_440
+ JMP LBB1_737
+
+LBB1_29:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB1_58
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB1_80
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB1_123
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB1_126
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_36
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_306
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_306
+
+LBB1_36:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_445:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_447
+
+LBB1_446:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_446
+
+LBB1_447:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_448:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_448
+ JMP LBB1_737
+
+LBB1_37:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_85
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_129
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_132
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x01100ff2 // movsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_43
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_309
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_309
+
+LBB1_43:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_453:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_455
+
+LBB1_454:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_454
+
+LBB1_455:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_456:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c100ff2; WORD $0x08ca // movsd xmm1, qword [rdx + 8*rcx + 8]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x08 // movsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c100ff2; WORD $0x10ca // movsd xmm1, qword [rdx + 8*rcx + 16]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x10 // movsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c100ff2; WORD $0x18ca // movsd xmm1, qword [rdx + 8*rcx + 24]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x18 // movsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_456
+ JMP LBB1_737
+
+LBB1_44:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_90
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_135
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_138
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x01100ff2 // movsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_50
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_312
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_312
+
+LBB1_50:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_461:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_463
+
+LBB1_462:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_462
+
+LBB1_463:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_464:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c100ff2; WORD $0x08ca // movsd xmm1, qword [rdx + 8*rcx + 8]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x08 // movsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c100ff2; WORD $0x10ca // movsd xmm1, qword [rdx + 8*rcx + 16]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x10 // movsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c100ff2; WORD $0x18ca // movsd xmm1, qword [rdx + 8*rcx + 24]
+ LONG $0xc85c0ff2 // subsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x18 // movsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_464
+ JMP LBB1_737
+
+LBB1_51:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_95
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_141
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_144
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x01100ff2 // movsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_57
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_315
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_315
+
+LBB1_57:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_469:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_471
+
+LBB1_470:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_470
+
+LBB1_471:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_472:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c100ff2; WORD $0x08ca // movsd xmm1, qword [rdx + 8*rcx + 8]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x08 // movsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c100ff2; WORD $0x10ca // movsd xmm1, qword [rdx + 8*rcx + 16]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x10 // movsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c100ff2; WORD $0x18ca // movsd xmm1, qword [rdx + 8*rcx + 24]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x18 // movsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_472
+ JMP LBB1_737
+
+LBB1_58:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB1_100
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB1_147
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB1_150
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ LONG $0x01100ff2 // movsd xmm0, qword [rcx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_64
+ LONG $0xc20c8d48 // lea rcx, [rdx + 8*rax]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_318
+ LONG $0xc00c8d49 // lea rcx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_318
+
+LBB1_64:
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_477:
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_479
+
+LBB1_478:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_478
+
+LBB1_479:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_480:
+ LONG $0x0c100ff2; BYTE $0xca // movsd xmm1, qword [rdx + 8*rcx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc80c // movsd qword [r8 + 8*rcx], xmm1
+ LONG $0x4c100ff2; WORD $0x08ca // movsd xmm1, qword [rdx + 8*rcx + 8]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x08 // movsd qword [r8 + 8*rcx + 8], xmm1
+ LONG $0x4c100ff2; WORD $0x10ca // movsd xmm1, qword [rdx + 8*rcx + 16]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x10 // movsd qword [r8 + 8*rcx + 16], xmm1
+ LONG $0x4c100ff2; WORD $0x18ca // movsd xmm1, qword [rdx + 8*rcx + 24]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xc84c; BYTE $0x18 // movsd qword [r8 + 8*rcx + 24], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_480
+ JMP LBB1_737
+
+LBB1_65:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_153
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_69
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_321
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_321
+
+LBB1_69:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_485:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_487
+
+LBB1_486:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_486
+
+LBB1_487:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_488:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_488
+ JMP LBB1_737
+
+LBB1_70:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_156
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_74
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_324
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_324
+
+LBB1_74:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_493:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_495
+
+LBB1_494:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_494
+
+LBB1_495:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_496:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_496
+ JMP LBB1_737
+
+LBB1_75:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_159
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_79
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_327
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_327
+
+LBB1_79:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_501:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_503
+
+LBB1_502:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_502
+
+LBB1_503:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_504:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_504
+ JMP LBB1_737
+
+LBB1_80:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB1_162
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018a // mov al, byte [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_84
+ LONG $0x120c8d4a // lea rcx, [rdx + r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_330
+ LONG $0x100c8d4b // lea rcx, [r8 + r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_330
+
+LBB1_84:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_509:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_511
+
+LBB1_510:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_510
+
+LBB1_511:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_512:
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_512
+ JMP LBB1_737
+
+LBB1_85:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_165
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_89
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_333
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_333
+
+LBB1_89:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_517:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_519
+
+LBB1_518:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_518
+
+LBB1_519:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_520:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_520
+ JMP LBB1_737
+
+LBB1_90:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_168
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_94
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_336
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_336
+
+LBB1_94:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_525:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_527
+
+LBB1_526:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_526
+
+LBB1_527:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_528:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_528
+ JMP LBB1_737
+
+LBB1_95:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_171
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_99
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_339
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_339
+
+LBB1_99:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_533:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_535
+
+LBB1_534:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_534
+
+LBB1_535:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_536:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_536
+ JMP LBB1_737
+
+LBB1_100:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB1_174
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB1_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_104
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_342
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_342
+
+LBB1_104:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_541:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_543
+
+LBB1_542:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_542
+
+LBB1_543:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_544:
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_544
+ JMP LBB1_737
+
+LBB1_105:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_107
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_345
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_345
+
+LBB1_107:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_549:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_551
+
+LBB1_550:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_550
+
+LBB1_551:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_552:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_552
+ JMP LBB1_737
+
+LBB1_108:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_110
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_348
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_348
+
+LBB1_110:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_557:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_559
+
+LBB1_558:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_558
+
+LBB1_559:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_560:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_560
+ JMP LBB1_737
+
+LBB1_111:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_113
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_351
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_351
+
+LBB1_113:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_565:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_567
+
+LBB1_566:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_566
+
+LBB1_567:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_568:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_568
+ JMP LBB1_737
+
+LBB1_114:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_116
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_354
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_354
+
+LBB1_116:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_573:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_575
+
+LBB1_574:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_574
+
+LBB1_575:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_576:
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_576
+ JMP LBB1_737
+
+LBB1_117: // case arm: out[i] = in[i] + s on 16-bit elements (in = rdx, out = r8, s = word at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_119 // fewer than 16 elements: use the scalar loops directly
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_357 // input ends at or before the output start; LBB1_357 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_357 // output ends at or before the input start
+
+LBB1_119: // count < 16 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_581: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_583 // nothing to peel
+
+LBB1_582: // peel loop: one element per pass, rdi passes
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_582
+
+LBB1_583: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_584: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_584
+ JMP LBB1_737 // done with this case arm
+
+LBB1_120: // case arm: out[i] = in[i] + s on 16-bit elements (in = rdx, out = r8, s = word at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_122 // fewer than 16 elements: use the scalar loops directly
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_360 // input ends at or before the output start; LBB1_360 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_360 // output ends at or before the input start
+
+LBB1_122: // count < 16 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_589: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_591 // nothing to peel
+
+LBB1_590: // peel loop: one element per pass, rdi passes
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_590
+
+LBB1_591: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_592: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_592
+ JMP LBB1_737 // done with this case arm
+
+LBB1_123: // case arm: out[i] = in[i] + s on 16-bit elements (in = rdx, out = r8, s = word at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_125 // fewer than 16 elements: use the scalar loops directly
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_363 // input ends at or before the output start; LBB1_363 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_363 // output ends at or before the input start
+
+LBB1_125: // count < 16 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_597: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_599 // nothing to peel
+
+LBB1_598: // peel loop: one element per pass, rdi passes
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_598
+
+LBB1_599: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_600: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_600
+ JMP LBB1_737 // done with this case arm
+
+LBB1_126: // case arm: out[i] = in[i] + s on 16-bit elements (in = rdx, out = r8, s = word at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0xb70f; BYTE $0x01 // movzx eax, word [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB1_128 // fewer than 16 elements: use the scalar loops directly
+ LONG $0x520c8d4a // lea rcx, [rdx + 2*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_366 // input ends at or before the output start; LBB1_366 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x500c8d4b // lea rcx, [r8 + 2*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_366 // output ends at or before the input start
+
+LBB1_128: // count < 16 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_605: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_607 // nothing to peel
+
+LBB1_606: // peel loop: one element per pass, rdi passes
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_606
+
+LBB1_607: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_608: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x720cb70f // movzx ecx, word [rdx + 2*rsi]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x0c894166; BYTE $0x70 // mov word [r8 + 2*rsi], cx
+ LONG $0x724cb70f; BYTE $0x02 // movzx ecx, word [rdx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], cx
+ LONG $0x724cb70f; BYTE $0x04 // movzx ecx, word [rdx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], cx
+ LONG $0x724cb70f; BYTE $0x06 // movzx ecx, word [rdx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc1 // add cx, ax
+ LONG $0x4c894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], cx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_608
+ JMP LBB1_737 // done with this case arm
+
+LBB1_129: // case arm: out[i] = in[i] - s on 64-bit integer elements (in = rdx, out = r8, s = qword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_131 // fewer than 4 elements: use the scalar loops directly
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_369 // input ends at or before the output start; LBB1_369 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_369 // output ends at or before the input start
+
+LBB1_131: // count < 4 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_613: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_615 // nothing to peel
+
+LBB1_614: // peel loop: one element per pass, rdi passes
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_614
+
+LBB1_615: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_616: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_616
+ JMP LBB1_737 // done with this case arm
+
+LBB1_132: // case arm: out[i] = in[i] - s on single-precision floats (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ LONG $0x01100ff3 // movss xmm0, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d ; rax = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_134 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_372 // input ends at or before the output start; LBB1_372 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_372 // output ends at or before the input start
+
+LBB1_134: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_621: // scalar path; rcx = next element index (0 when reached by fall-through)
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax ; rsi = rax - rcx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_623 // nothing to peel
+
+LBB1_622: // peel loop: one element per pass, rdi passes
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_622
+
+LBB1_623: // skip the unrolled loop when rsi (computed before peeling) is below 3
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_624: // main loop, unrolled 4x; runs until rcx == rax
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c100ff3; WORD $0x048a // movss xmm1, dword [rdx + 4*rcx + 4]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x04 // movss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c100ff3; WORD $0x088a // movss xmm1, dword [rdx + 4*rcx + 8]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x08 // movss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c100ff3; WORD $0x0c8a // movss xmm1, dword [rdx + 4*rcx + 12]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x0c // movss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_624
+ JMP LBB1_737 // done with this case arm
+
+LBB1_135: // case arm: out[i] = in[i] - s on 64-bit integer elements (in = rdx, out = r8, s = qword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_137 // fewer than 4 elements: use the scalar loops directly
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_375 // input ends at or before the output start; LBB1_375 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_375 // output ends at or before the input start
+
+LBB1_137: // count < 4 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_629: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_631 // nothing to peel
+
+LBB1_630: // peel loop: one element per pass, rdi passes
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_630
+
+LBB1_631: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_632: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x2948; BYTE $0xc1 // sub rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_632
+ JMP LBB1_737 // done with this case arm
+
+LBB1_138: // case arm: out[i] = in[i] - s on single-precision floats (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ LONG $0x01100ff3 // movss xmm0, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d ; rax = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_140 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_378 // input ends at or before the output start; LBB1_378 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_378 // output ends at or before the input start
+
+LBB1_140: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_637: // scalar path; rcx = next element index (0 when reached by fall-through)
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax ; rsi = rax - rcx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_639 // nothing to peel
+
+LBB1_638: // peel loop: one element per pass, rdi passes
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_638
+
+LBB1_639: // skip the unrolled loop when rsi (computed before peeling) is below 3
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_640: // main loop, unrolled 4x; runs until rcx == rax
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c100ff3; WORD $0x048a // movss xmm1, dword [rdx + 4*rcx + 4]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x04 // movss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c100ff3; WORD $0x088a // movss xmm1, dword [rdx + 4*rcx + 8]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x08 // movss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c100ff3; WORD $0x0c8a // movss xmm1, dword [rdx + 4*rcx + 12]
+ LONG $0xc85c0ff3 // subss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x0c // movss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_640
+ JMP LBB1_737 // done with this case arm
+
+LBB1_141: // case arm: out[i] = in[i] + s on 64-bit integer elements (in = rdx, out = r8, s = qword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_143 // fewer than 4 elements: use the scalar loops directly
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_381 // input ends at or before the output start; LBB1_381 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_381 // output ends at or before the input start
+
+LBB1_143: // count < 4 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_645: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_647 // nothing to peel
+
+LBB1_646: // peel loop: one element per pass, rdi passes
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_646
+
+LBB1_647: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_648: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_648
+ JMP LBB1_737 // done with this case arm
+
+LBB1_144: // case arm: out[i] = in[i] + s on single-precision floats (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ LONG $0x01100ff3 // movss xmm0, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d ; rax = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_146 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_384 // input ends at or before the output start; LBB1_384 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_384 // output ends at or before the input start
+
+LBB1_146: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_653: // scalar path; rcx = next element index (0 when reached by fall-through)
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax ; rsi = rax - rcx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_655 // nothing to peel
+
+LBB1_654: // peel loop: one element per pass, rdi passes
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_654
+
+LBB1_655: // skip the unrolled loop when rsi (computed before peeling) is below 3
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_656: // main loop, unrolled 4x; runs until rcx == rax
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c100ff3; WORD $0x048a // movss xmm1, dword [rdx + 4*rcx + 4]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x04 // movss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c100ff3; WORD $0x088a // movss xmm1, dword [rdx + 4*rcx + 8]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x08 // movss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c100ff3; WORD $0x0c8a // movss xmm1, dword [rdx + 4*rcx + 12]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x0c // movss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_656
+ JMP LBB1_737 // done with this case arm
+
+LBB1_147: // case arm: out[i] = in[i] + s on 64-bit integer elements (in = rdx, out = r8, s = qword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x8b48; BYTE $0x01 // mov rax, qword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB1_149 // fewer than 4 elements: use the scalar loops directly
+ LONG $0xd20c8d4a // lea rcx, [rdx + 8*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_387 // input ends at or before the output start; LBB1_387 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0xd00c8d4b // lea rcx, [r8 + 8*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_387 // output ends at or before the input start
+
+LBB1_149: // count < 4 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_661: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_663 // nothing to peel
+
+LBB1_662: // peel loop: one element per pass, rdi passes
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_662
+
+LBB1_663: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_664: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0xf20c8b48 // mov rcx, qword [rdx + 8*rsi]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf00c8949 // mov qword [r8 + 8*rsi], rcx
+ LONG $0xf24c8b48; BYTE $0x08 // mov rcx, qword [rdx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rcx
+ LONG $0xf24c8b48; BYTE $0x10 // mov rcx, qword [rdx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rcx
+ LONG $0xf24c8b48; BYTE $0x18 // mov rcx, qword [rdx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc1 // add rcx, rax
+ LONG $0xf04c8949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rcx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_664
+ JMP LBB1_737 // done with this case arm
+
+LBB1_150: // case arm: out[i] = in[i] + s on single-precision floats (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ LONG $0x01100ff3 // movss xmm0, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d ; rax = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_152 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x820c8d48 // lea rcx, [rdx + 4*rax] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_390 // input ends at or before the output start; LBB1_390 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x800c8d49 // lea rcx, [r8 + 4*rax] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_390 // output ends at or before the input start
+
+LBB1_152: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xc931 // xor ecx, ecx
+
+LBB1_669: // scalar path; rcx = next element index (0 when reached by fall-through)
+ WORD $0x8948; BYTE $0xce // mov rsi, rcx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax ; rsi = rax - rcx - 1
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_671 // nothing to peel
+
+LBB1_670: // peel loop: one element per pass, rdi passes
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x01c18348 // add rcx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_670
+
+LBB1_671: // skip the unrolled loop when rsi (computed before peeling) is below 3
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB1_737
+
+LBB1_672: // main loop, unrolled 4x; runs until rcx == rax
+ LONG $0x0c100ff3; BYTE $0x8a // movss xmm1, dword [rdx + 4*rcx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x880c // movss dword [r8 + 4*rcx], xmm1
+ LONG $0x4c100ff3; WORD $0x048a // movss xmm1, dword [rdx + 4*rcx + 4]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x04 // movss dword [r8 + 4*rcx + 4], xmm1
+ LONG $0x4c100ff3; WORD $0x088a // movss xmm1, dword [rdx + 4*rcx + 8]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x08 // movss dword [r8 + 4*rcx + 8], xmm1
+ LONG $0x4c100ff3; WORD $0x0c8a // movss xmm1, dword [rdx + 4*rcx + 12]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x884c; BYTE $0x0c // movss dword [r8 + 4*rcx + 12], xmm1
+ LONG $0x04c18348 // add rcx, 4
+ WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
+ JNE LBB1_672
+ JMP LBB1_737 // done with this case arm
+
+LBB1_153: // case arm: out[i] = in[i] - s on 8-bit elements (in = rdx, out = r8, s = byte at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018a // mov al, byte [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_155 // fewer than 32 elements: use the scalar loops directly
+ LONG $0x120c8d4a // lea rcx, [rdx + r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_393 // input ends at or before the output start; LBB1_393 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_393 // output ends at or before the input start
+
+LBB1_155: // count < 32 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_677: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_679 // nothing to peel
+
+LBB1_678: // peel loop: one element per pass, rdi passes
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_678
+
+LBB1_679: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_680: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_680
+ JMP LBB1_737 // done with this case arm
+
+LBB1_156: // case arm: out[i] = in[i] - s on 8-bit elements (in = rdx, out = r8, s = byte at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018a // mov al, byte [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_158 // fewer than 32 elements: use the scalar loops directly
+ LONG $0x120c8d4a // lea rcx, [rdx + r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_396 // input ends at or before the output start; LBB1_396 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_396 // output ends at or before the input start
+
+LBB1_158: // count < 32 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_685: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_687 // nothing to peel
+
+LBB1_686: // peel loop: one element per pass, rdi passes
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_686
+
+LBB1_687: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_688: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc128 // sub cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc128 // sub cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_688
+ JMP LBB1_737 // done with this case arm
+
+LBB1_159: // case arm: out[i] = in[i] + s on 8-bit elements (in = rdx, out = r8, s = byte at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018a // mov al, byte [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_161 // fewer than 32 elements: use the scalar loops directly
+ LONG $0x120c8d4a // lea rcx, [rdx + r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_399 // input ends at or before the output start; LBB1_399 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_399 // output ends at or before the input start
+
+LBB1_161: // count < 32 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_693: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_695 // nothing to peel
+
+LBB1_694: // peel loop: one element per pass, rdi passes
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_694
+
+LBB1_695: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_696: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_696
+ JMP LBB1_737 // done with this case arm
+
+LBB1_162: // case arm: out[i] = in[i] + s on 8-bit elements (in = rdx, out = r8, s = byte at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018a // mov al, byte [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB1_164 // fewer than 32 elements: use the scalar loops directly
+ LONG $0x120c8d4a // lea rcx, [rdx + r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_402 // input ends at or before the output start; LBB1_402 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x100c8d4b // lea rcx, [r8 + r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_402 // output ends at or before the input start
+
+LBB1_164: // count < 32 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_701: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_703 // nothing to peel
+
+LBB1_702: // peel loop: one element per pass, rdi passes
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_702
+
+LBB1_703: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_704: // main loop, unrolled 4x; runs until rsi == r10
+ LONG $0x320cb60f // movzx ecx, byte [rdx + rsi]
+ WORD $0xc100 // add cl, al
+ LONG $0x300c8841 // mov byte [r8 + rsi], cl
+ LONG $0x324cb60f; BYTE $0x01 // movzx ecx, byte [rdx + rsi + 1]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x01 // mov byte [r8 + rsi + 1], cl
+ LONG $0x324cb60f; BYTE $0x02 // movzx ecx, byte [rdx + rsi + 2]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x02 // mov byte [r8 + rsi + 2], cl
+ LONG $0x324cb60f; BYTE $0x03 // movzx ecx, byte [rdx + rsi + 3]
+ WORD $0xc100 // add cl, al
+ LONG $0x304c8841; BYTE $0x03 // mov byte [r8 + rsi + 3], cl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_704
+ JMP LBB1_737 // done with this case arm
+
+LBB1_165: // case arm: out[i] = in[i] - s on 32-bit integer elements (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018b // mov eax, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_167 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_405 // input ends at or before the output start; LBB1_405 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_405 // output ends at or before the input start
+
+LBB1_167: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_709: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_711 // nothing to peel
+
+LBB1_710: // peel loop: one element per pass, rdi passes
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_710
+
+LBB1_711: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_712: // main loop, unrolled 4x; runs until rsi == r10
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_712
+ JMP LBB1_737 // done with this case arm
+
+LBB1_168: // case arm: out[i] = in[i] - s on 32-bit integer elements (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018b // mov eax, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_170 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_408 // input ends at or before the output start; LBB1_408 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_408 // output ends at or before the input start
+
+LBB1_170: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_717: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_719 // nothing to peel
+
+LBB1_718: // peel loop: one element per pass, rdi passes
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_718
+
+LBB1_719: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_720: // main loop, unrolled 4x; runs until rsi == r10
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc129 // sub ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_720
+ JMP LBB1_737 // done with this case arm
+
+LBB1_171: // case arm: out[i] = in[i] + s on 32-bit integer elements (in = rdx, out = r8, s = dword at [rcx], count = r9d)
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737 // count <= 0 (signed): nothing to do; LBB1_737 is outside this excerpt, presumably the shared exit
+ WORD $0x018b // mov eax, dword [rcx] ; scalar operand, loaded once before the loops
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d ; r10 = count, zero-extended to 64 bits
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_173 // fewer than 8 elements: use the scalar loops directly
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10] ; one past the end of the input
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_411 // input ends at or before the output start; LBB1_411 is outside this excerpt (NOTE(review): presumably the vectorized path)
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10] ; one past the end of the output
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_411 // output ends at or before the input start
+
+LBB1_173: // count < 8 or the two ranges overlap: scalar path starting at index 0
+ WORD $0xf631 // xor esi, esi
+
+LBB1_725: // scalar path; rsi = next element index (0 when reached by fall-through)
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10 ; r9 = r10 - rsi - 1
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3 ; rdi = count mod 4
+ JE LBB1_727 // nothing to peel
+
+LBB1_726: // peel loop: one element per pass, rdi passes
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_726
+
+LBB1_727: // skip the unrolled loop when r9 (computed before peeling) is below 3
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_728: // main loop, unrolled 4x; runs until rsi == r10
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_728
+ JMP LBB1_737 // done with this case arm
+
+LBB1_174:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB1_737
+ WORD $0x018b // mov eax, dword [rcx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB1_176
+ LONG $0x920c8d4a // lea rcx, [rdx + 4*r10]
+ WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
+ JBE LBB1_414
+ LONG $0x900c8d4b // lea rcx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xd1 // cmp rcx, rdx
+ JBE LBB1_414
+
+LBB1_176:
+ WORD $0xf631 // xor esi, esi
+
+LBB1_733:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB1_735
+
+LBB1_734:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB1_734
+
+LBB1_735:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB1_737
+
+LBB1_736:
+ WORD $0x0c8b; BYTE $0xb2 // mov ecx, dword [rdx + 4*rsi]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb00c8941 // mov dword [r8 + 4*rsi], ecx
+ LONG $0x04b24c8b // mov ecx, dword [rdx + 4*rsi + 4]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], ecx
+ LONG $0x08b24c8b // mov ecx, dword [rdx + 4*rsi + 8]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], ecx
+ LONG $0x0cb24c8b // mov ecx, dword [rdx + 4*rsi + 12]
+ WORD $0xc101 // add ecx, eax
+ LONG $0xb04c8941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], ecx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB1_736
+ JMP LBB1_737
+
+LBB1_297:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_417
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_299:
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_299
+ JMP LBB1_418
+
+LBB1_300:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_425
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_302:
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_302
+ JMP LBB1_426
+
+LBB1_303:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_433
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_305:
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_305
+ JMP LBB1_434
+
+LBB1_306:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_441
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_308:
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_308
+ JMP LBB1_442
+
+LBB1_309:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xfc // and ecx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc718d48 // lea rsi, [rcx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_449
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB1_311:
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd15c0f66 // subpd xmm2, xmm1
+ LONG $0xd95c0f66 // subpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+ LONG $0x54100f66; WORD $0x20fa // movupd xmm2, oword [rdx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30fa // movupd xmm3, oword [rdx + 8*rdi + 48]
+ LONG $0xd15c0f66 // subpd xmm2, xmm1
+ LONG $0xd95c0f66 // subpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm3
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_311
+ JMP LBB1_450
+
+LBB1_312:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xfc // and ecx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc718d48 // lea rsi, [rcx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_457
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB1_314:
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd15c0f66 // subpd xmm2, xmm1
+ LONG $0xd95c0f66 // subpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+ LONG $0x54100f66; WORD $0x20fa // movupd xmm2, oword [rdx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30fa // movupd xmm3, oword [rdx + 8*rdi + 48]
+ LONG $0xd15c0f66 // subpd xmm2, xmm1
+ LONG $0xd95c0f66 // subpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm3
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_314
+ JMP LBB1_458
+
+LBB1_315:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xfc // and ecx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc718d48 // lea rsi, [rcx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_465
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB1_317:
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+ LONG $0x54100f66; WORD $0x20fa // movupd xmm2, oword [rdx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30fa // movupd xmm3, oword [rdx + 8*rdi + 48]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm3
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_317
+ JMP LBB1_466
+
+LBB1_318:
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xfc // and ecx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc718d48 // lea rsi, [rcx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_473
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB1_320:
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+ LONG $0x54100f66; WORD $0x20fa // movupd xmm2, oword [rdx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30fa // movupd xmm3, oword [rdx + 8*rdi + 48]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm3
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB1_320
+ JMP LBB1_474
+
+LBB1_321:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_481
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_323:
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_323
+ JMP LBB1_482
+
+LBB1_324:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_489
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_326:
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_326
+ JMP LBB1_490
+
+LBB1_327:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_497
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_329:
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_329
+ JMP LBB1_498
+
+LBB1_330:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_505
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_332:
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_332
+ JMP LBB1_506
+
+LBB1_333:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_513
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_335:
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_335
+ JMP LBB1_514
+
+LBB1_336:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_521
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_338:
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_338
+ JMP LBB1_522
+
+LBB1_339:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_529
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_341:
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_341
+ JMP LBB1_530
+
+LBB1_342:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_537
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_344:
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_344
+ JMP LBB1_538
+
+LBB1_345:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_545
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_347:
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_347
+ JMP LBB1_546
+
+LBB1_348:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_553
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_350:
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_350
+ JMP LBB1_554
+
+LBB1_351:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_561
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_353:
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_353
+ JMP LBB1_562
+
+LBB1_354:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_569
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx
+ WORD $0xff31 // xor edi, edi
+
+LBB1_356:
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c18348 // add rcx, 2
+ JNE LBB1_356
+ JMP LBB1_570
+
+LBB1_357: // 16-bit add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; low 16 bits of eax are now in all 8 word lanes of xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_577 // rsi == 16 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_359: // unrolled x2: each iteration handles two chunks = 32 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32 ; advance index by 32 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_359 // repeat while chunk pairs remain
+ JMP LBB1_578 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_360: // 16-bit add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; low 16 bits of eax are now in all 8 word lanes of xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_585 // rsi == 16 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_362: // unrolled x2: each iteration handles two chunks = 32 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32 ; advance index by 32 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_362 // repeat while chunk pairs remain
+ JMP LBB1_586 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_363: // 16-bit add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; low 16 bits of eax are now in all 8 word lanes of xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_593 // rsi == 16 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_365: // unrolled x2: each iteration handles two chunks = 32 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32 ; advance index by 32 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_365 // repeat while chunk pairs remain
+ JMP LBB1_594 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_366: // 16-bit add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16 ; esi = r10d rounded down to a multiple of 16 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; low 16 bits of eax are now in all 8 word lanes of xmm0
+ LONG $0xf04e8d48 // lea rcx, [rsi - 16]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 16) / 16 + 1 = number of 16-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_601 // rsi == 16 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_368: // unrolled x2: each iteration handles two chunks = 32 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x207a // movdqu xmm1, oword [rdx + 2*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x307a // movdqu xmm2, oword [rdx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32 ; advance index by 32 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_368 // repeat while chunk pairs remain
+ JMP LBB1_602 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_369: // 64-bit integer subtract, SSE path: out[i] = in[i] - scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4 ; esi = r10d rounded down to a multiple of 4 elements
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68 ; rax is now in both qword lanes of xmm0
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_609 // rsi == 4 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_371: // unrolled x2: each iteration handles two chunks = 8 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8 ; advance index by 8 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_371 // repeat while chunk pairs remain
+ JMP LBB1_610 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_372: // float32 subtract, SSE path: out[i] = in[i] - scalar (scalar arrives in xmm0); reads via rdx, writes via r8
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf8 // and ecx, -8 ; ecx = eax rounded down to a multiple of 8 elements
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0 ; lane 0 of xmm0 is now in all 4 float lanes of xmm1
+ LONG $0xf8718d48 // lea rsi, [rcx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rcx - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_617 // rcx == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_374: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xd1 // subps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd9 // subps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+ LONG $0xba54100f; BYTE $0x20 // movups xmm2, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0xba5c100f; BYTE $0x30 // movups xmm3, oword [rdx + 4*rdi + 48]
+ WORD $0x5c0f; BYTE $0xd1 // subps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd9 // subps xmm3, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x5c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm3
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; sets ZF when rsi reaches 0
+ JNE LBB1_374 // repeat while chunk pairs remain
+ JMP LBB1_618 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_375: // 64-bit integer subtract, SSE path: out[i] = in[i] - scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4 ; esi = r10d rounded down to a multiple of 4 elements
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68 ; rax is now in both qword lanes of xmm0
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_625 // rsi == 4 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_377: // unrolled x2: each iteration handles two chunks = 8 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8 ; advance index by 8 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_377 // repeat while chunk pairs remain
+ JMP LBB1_626 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_378: // float32 subtract, SSE path: out[i] = in[i] - scalar (scalar arrives in xmm0); reads via rdx, writes via r8
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf8 // and ecx, -8 ; ecx = eax rounded down to a multiple of 8 elements
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0 ; lane 0 of xmm0 is now in all 4 float lanes of xmm1
+ LONG $0xf8718d48 // lea rsi, [rcx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rcx - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_633 // rcx == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_380: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xd1 // subps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd9 // subps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+ LONG $0xba54100f; BYTE $0x20 // movups xmm2, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0xba5c100f; BYTE $0x30 // movups xmm3, oword [rdx + 4*rdi + 48]
+ WORD $0x5c0f; BYTE $0xd1 // subps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd9 // subps xmm3, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x5c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm3
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; sets ZF when rsi reaches 0
+ JNE LBB1_380 // repeat while chunk pairs remain
+ JMP LBB1_634 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_381: // 64-bit integer add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4 ; esi = r10d rounded down to a multiple of 4 elements
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68 ; rax is now in both qword lanes of xmm0
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_641 // rsi == 4 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_383: // unrolled x2: each iteration handles two chunks = 8 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8 ; advance index by 8 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_383 // repeat while chunk pairs remain
+ JMP LBB1_642 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_384: // float32 add, SSE path: out[i] = in[i] + scalar (scalar arrives in xmm0); reads via rdx, writes via r8
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf8 // and ecx, -8 ; ecx = eax rounded down to a multiple of 8 elements
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0 ; lane 0 of xmm0 is now in all 4 float lanes of xmm1
+ LONG $0xf8718d48 // lea rsi, [rcx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rcx - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_649 // rcx == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_386: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+ LONG $0xba54100f; BYTE $0x20 // movups xmm2, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0xba5c100f; BYTE $0x30 // movups xmm3, oword [rdx + 4*rdi + 48]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x5c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm3
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; sets ZF when rsi reaches 0
+ JNE LBB1_386 // repeat while chunk pairs remain
+ JMP LBB1_650 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_387: // 64-bit integer add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4 ; esi = r10d rounded down to a multiple of 4 elements
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68 ; rax is now in both qword lanes of xmm0
+ LONG $0xfc4e8d48 // lea rcx, [rsi - 4]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 4) / 4 + 1 = number of 4-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_657 // rsi == 4 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_389: // unrolled x2: each iteration handles two chunks = 8 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20fa // movdqu xmm1, oword [rdx + 8*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30fa // movdqu xmm2, oword [rdx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8 ; advance index by 8 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_389 // repeat while chunk pairs remain
+ JMP LBB1_658 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_390: // float32 add, SSE path: out[i] = in[i] + scalar (scalar arrives in xmm0); reads via rdx, writes via r8
+ WORD $0xc189 // mov ecx, eax
+ WORD $0xe183; BYTE $0xf8 // and ecx, -8 ; ecx = eax rounded down to a multiple of 8 elements
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0 ; lane 0 of xmm0 is now in all 4 float lanes of xmm1
+ LONG $0xf8718d48 // lea rsi, [rcx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rcx - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB1_665 // rcx == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi ; rsi = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_392: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+ LONG $0xba54100f; BYTE $0x20 // movups xmm2, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0xba5c100f; BYTE $0x30 // movups xmm3, oword [rdx + 4*rdi + 48]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x5c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm3
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c68348 // add rsi, 2 ; two chunks consumed; sets ZF when rsi reaches 0
+ JNE LBB1_392 // repeat while chunk pairs remain
+ JMP LBB1_666 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_393: // 8-bit subtract, SSE path: out[i] = in[i] - scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32 elements
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1 ; all-zero shuffle mask selects byte 0 for every lane
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1 ; al is now in all 16 byte lanes of xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_673 // rsi == 32 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB1_395: // unrolled x2: each iteration handles two chunks = 64 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64 ; advance index by 64 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_395 // repeat while chunk pairs remain
+ JMP LBB1_674 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_396: // 8-bit subtract, SSE path: out[i] = in[i] - scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32 elements
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1 ; all-zero shuffle mask selects byte 0 for every lane
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1 ; al is now in all 16 byte lanes of xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_681 // rsi == 32 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB1_398: // unrolled x2: each iteration handles two chunks = 64 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64 ; advance index by 64 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_398 // repeat while chunk pairs remain
+ JMP LBB1_682 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_399: // 8-bit add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32 elements
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1 ; all-zero shuffle mask selects byte 0 for every lane
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1 ; al is now in all 16 byte lanes of xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_689 // rsi == 32 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB1_401: // unrolled x2: each iteration handles two chunks = 64 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64 ; advance index by 64 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_401 // repeat while chunk pairs remain
+ JMP LBB1_690 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_402: // 8-bit add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32 ; esi = r10d rounded down to a multiple of 32 elements
+ WORD $0xb60f; BYTE $0xc8 // movzx ecx, al
+ LONG $0xc16e0f66 // movd xmm0, ecx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1 ; all-zero shuffle mask selects byte 0 for every lane
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1 ; al is now in all 16 byte lanes of xmm0
+ LONG $0xe04e8d48 // lea rcx, [rsi - 32]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 32) / 32 + 1 = number of 32-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_697 // rsi == 32 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element (byte) index, starts at 0
+
+LBB1_404: // unrolled x2: each iteration handles two chunks = 64 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x203a // movdqu xmm1, oword [rdx + rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x303a // movdqu xmm2, oword [rdx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64 ; advance index by 64 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_404 // repeat while chunk pairs remain
+ JMP LBB1_698 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_405: // 32-bit integer subtract, SSE path: out[i] = in[i] - scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8 ; esi = r10d rounded down to a multiple of 8 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; eax is now in all 4 dword lanes of xmm0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_705 // rsi == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_407: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_407 // repeat while chunk pairs remain
+ JMP LBB1_706 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_408: // 32-bit integer subtract, SSE path: out[i] = in[i] - scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8 ; esi = r10d rounded down to a multiple of 8 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; eax is now in all 4 dword lanes of xmm0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_713 // rsi == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_410: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_410 // repeat while chunk pairs remain
+ JMP LBB1_714 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_411: // 32-bit integer add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8 ; esi = r10d rounded down to a multiple of 8 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; eax is now in all 4 dword lanes of xmm0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_721 // rsi == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_413: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_413 // repeat while chunk pairs remain
+ JMP LBB1_722 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_414: // 32-bit integer add, SSE path: out[i] = in[i] + scalar; reads via rdx, writes via r8
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8 ; esi = r10d rounded down to a multiple of 8 elements
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0 ; eax is now in all 4 dword lanes of xmm0
+ LONG $0xf84e8d48 // lea rcx, [rsi - 8]
+ WORD $0x8949; BYTE $0xc9 // mov r9, rcx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1 ; r9 = (rsi - 8) / 8 + 1 = number of 8-element chunks
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE LBB1_729 // rsi == 8 (a single chunk): bypass the unrolled loop; target is outside this chunk
+ WORD $0x894c; BYTE $0xc9 // mov rcx, r9
+ LONG $0xfee18348 // and rcx, -2
+ WORD $0xf748; BYTE $0xd9 // neg rcx ; rcx = -(r9 rounded down to even); the loop counts it up to zero
+ WORD $0xff31 // xor edi, edi ; rdi = element index, starts at 0
+
+LBB1_416: // unrolled x2: each iteration handles two chunks = 16 elements (64 bytes)
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20ba // movdqu xmm1, oword [rdx + 4*rdi + 32] ; second chunk of this iteration
+ LONG $0x546f0ff3; WORD $0x30ba // movdqu xmm2, oword [rdx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16 ; advance index by 16 elements
+ LONG $0x02c18348 // add rcx, 2 ; two chunks consumed; sets ZF when rcx reaches 0
+ JNE LBB1_416 // repeat while chunk pairs remain
+ JMP LBB1_730 // target outside this chunk; presumably the leftover-chunk tail (TODO confirm)
+
+LBB1_417: // 32-bit subtract tail, entry with index reset; presumably reached when the unrolled loop was bypassed (jump source not in this chunk)
+ WORD $0xff31 // xor edi, edi ; rdi = 0, then fall through
+
+LBB1_418: // handle the one leftover 8-element chunk when the chunk count in r9 is odd
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_420 // even chunk count: nothing left for the vector path
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0 ; xmm0 is expected to hold the broadcast scalar (set up outside this block)
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_420: // decide whether any elements remain after the vector path
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 ; presumably vector-processed count vs. total count (TODO confirm)
+ JE LBB1_737 // equal: nothing left; target outside this chunk, presumably the common exit
+ JMP LBB1_421 // otherwise continue at LBB1_421 (outside this chunk; presumably the scalar remainder loop)
+
+LBB1_425: // 32-bit subtract tail, entry with index reset; presumably reached when the unrolled loop was bypassed (jump source not in this chunk)
+ WORD $0xff31 // xor edi, edi ; rdi = 0, then fall through
+
+LBB1_426: // handle the one leftover 8-element chunk when the chunk count in r9 is odd
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_428 // even chunk count: nothing left for the vector path
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0 ; xmm0 is expected to hold the broadcast scalar (set up outside this block)
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_428: // decide whether any elements remain after the vector path
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 ; presumably vector-processed count vs. total count (TODO confirm)
+ JE LBB1_737 // equal: nothing left; target outside this chunk, presumably the common exit
+ JMP LBB1_429 // otherwise continue at LBB1_429 (outside this chunk; presumably the scalar remainder loop)
+
+LBB1_433: // 32-bit add tail, entry with index reset; presumably reached when the unrolled loop was bypassed (jump source not in this chunk)
+ WORD $0xff31 // xor edi, edi ; rdi = 0, then fall through
+
+LBB1_434: // handle the one leftover 8-element chunk when the chunk count in r9 is odd
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_436 // even chunk count: nothing left for the vector path
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0 ; xmm0 is expected to hold the broadcast scalar (set up outside this block)
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_436: // decide whether any elements remain after the vector path
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 ; presumably vector-processed count vs. total count (TODO confirm)
+ JE LBB1_737 // equal: nothing left; target outside this chunk, presumably the common exit
+ JMP LBB1_437 // otherwise continue at LBB1_437 (outside this chunk; presumably the scalar remainder loop)
+
+LBB1_441: // 32-bit add tail, entry with index reset; presumably reached when the unrolled loop was bypassed (jump source not in this chunk)
+ WORD $0xff31 // xor edi, edi ; rdi = 0, then fall through
+
+LBB1_442: // handle the one leftover 8-element chunk when the chunk count in r9 is odd
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_444 // even chunk count: nothing left for the vector path
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0 ; xmm0 is expected to hold the broadcast scalar (set up outside this block)
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_444: // decide whether any elements remain after the vector path
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10 ; presumably vector-processed count vs. total count (TODO confirm)
+ JE LBB1_737 // equal: nothing left; target outside this chunk, presumably the common exit
+ JMP LBB1_445 // otherwise continue at LBB1_445 (outside this chunk; presumably the scalar remainder loop)
+
+LBB1_449: // float64 subtract tail, entry with index reset; presumably reached when the unrolled loop was bypassed (jump source not in this chunk)
+ WORD $0xff31 // xor edi, edi ; rdi = 0, then fall through
+
+LBB1_450: // handle the one leftover 4-element chunk when the chunk count in r9 is odd
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_452 // even chunk count: nothing left for the vector path
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd15c0f66 // subpd xmm2, xmm1 ; xmm1 is expected to hold the broadcast scalar (set up outside this block)
+ LONG $0xd95c0f66 // subpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+
+LBB1_452: // decide whether any elements remain after the vector path
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax ; presumably vector-processed count vs. total count (TODO confirm)
+ JE LBB1_737 // equal: nothing left; target outside this chunk, presumably the common exit
+ JMP LBB1_453 // otherwise continue at LBB1_453 (outside this chunk; presumably the scalar remainder loop)
+
+LBB1_457: // float64 subtract tail, entry with index reset; presumably reached when the unrolled loop was bypassed (jump source not in this chunk)
+ WORD $0xff31 // xor edi, edi ; rdi = 0, then fall through
+
+LBB1_458: // handle the one leftover 4-element chunk when the chunk count in r9 is odd
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_460 // even chunk count: nothing left for the vector path
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd15c0f66 // subpd xmm2, xmm1 ; xmm1 is expected to hold the broadcast scalar (set up outside this block)
+ LONG $0xd95c0f66 // subpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+
+LBB1_460: // decide whether any elements remain after the vector path
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax ; presumably vector-processed count vs. total count (TODO confirm)
+ JE LBB1_737 // equal: nothing left; target outside this chunk, presumably the common exit
+ JMP LBB1_461 // otherwise continue at LBB1_461 (outside this chunk; presumably the scalar remainder loop)
+
+LBB1_465:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_466:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_468
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+
+LBB1_468:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_469
+
+LBB1_473:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_474:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_476
+ LONG $0x14100f66; BYTE $0xfa // movupd xmm2, oword [rdx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10fa // movupd xmm3, oword [rdx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+
+LBB1_476:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_477
+
+LBB1_481:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_482:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_484
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_484:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_485
+
+LBB1_489:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_490:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_492
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_492:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_493
+
+LBB1_497:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_498:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_500
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_500:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_501
+
+LBB1_505:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_506:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_508
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_508:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_509
+
+LBB1_513:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_514:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_516
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_516:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_517
+
+LBB1_521:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_522:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_524
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_524:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_525
+
+LBB1_529:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_530:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_532
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_532:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_533
+
+LBB1_537:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_538:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_540
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_540:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_541
+
+LBB1_545:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_546:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_548
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_548:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_549
+
+LBB1_553:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_554:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_556
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_556:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_557
+
+LBB1_561:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_562:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_564
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_564:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_565
+
+LBB1_569:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_570:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_572
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8f90f66 // psubw xmm1, xmm0
+ LONG $0xd0f90f66 // psubw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_572:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_573
+
+LBB1_577:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_578:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_580
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_580:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_581
+
+LBB1_585:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_586:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_588
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_588:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_589
+
+LBB1_593:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_594:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_596
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_596:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_597
+
+LBB1_601:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_602:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_604
+ LONG $0x0c6f0ff3; BYTE $0x7a // movdqu xmm1, oword [rdx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x107a // movdqu xmm2, oword [rdx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB1_604:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_605
+
+LBB1_609:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_610:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_612
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_612:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_613
+
+LBB1_617:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_618:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_620
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xd1 // subps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd9 // subps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+
+LBB1_620:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_621
+
+LBB1_625:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_626:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_628
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8fb0f66 // psubq xmm1, xmm0
+ LONG $0xd0fb0f66 // psubq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_628:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_629
+
+LBB1_633:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_634:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_636
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x5c0f; BYTE $0xd1 // subps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd9 // subps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+
+LBB1_636:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_637
+
+LBB1_641:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_642:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_644
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_644:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_645
+
+LBB1_649:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_650:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_652
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+
+LBB1_652:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_653
+
+LBB1_657:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_658:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_660
+ LONG $0x0c6f0ff3; BYTE $0xfa // movdqu xmm1, oword [rdx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10fa // movdqu xmm2, oword [rdx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB1_660:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_661
+
+LBB1_665:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_666:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_668
+ LONG $0xba14100f // movups xmm2, oword [rdx + 4*rdi]
+ LONG $0xba5c100f; BYTE $0x10 // movups xmm3, oword [rdx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+
+LBB1_668:
+ WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
+ JE LBB1_737
+ JMP LBB1_669
+
+LBB1_673:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_674:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_676
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_676:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_677
+
+LBB1_681:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_682:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_684
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8f80f66 // psubb xmm1, xmm0
+ LONG $0xd0f80f66 // psubb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_684:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_685
+
+LBB1_689:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_690:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_692
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_692:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_693
+
+LBB1_697:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_698:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_700
+ LONG $0x0c6f0ff3; BYTE $0x3a // movdqu xmm1, oword [rdx + rdi]
+ LONG $0x546f0ff3; WORD $0x103a // movdqu xmm2, oword [rdx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB1_700:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_701
+
+LBB1_705:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_706:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_708
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_708:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_709
+
+LBB1_713:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_714:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_716
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fa0f66 // psubd xmm1, xmm0
+ LONG $0xd0fa0f66 // psubd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_716:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_717
+
+LBB1_721:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_722:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_724
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_724:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB1_737
+ JMP LBB1_725
+
+LBB1_729:
+ WORD $0xff31 // xor edi, edi
+
+LBB1_730:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB1_732
+ LONG $0x0c6f0ff3; BYTE $0xba // movdqu xmm1, oword [rdx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10ba // movdqu xmm2, oword [rdx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB1_732:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB1_733
+
+LBB1_737:
+ RET
+
+TEXT ·_arithmetic_scalar_arr_sse4(SB), $0-48
+
+ MOVQ typ+0(FP), DI
+ MOVQ op+8(FP), SI
+ MOVQ inLeft+16(FP), DX
+ MOVQ inRight+24(FP), CX
+ MOVQ out+32(FP), R8
+ MOVQ len+40(FP), R9
+
+ LONG $0x01fe8040 // cmp sil, 1
+ JG LBB2_11
+ WORD $0x8440; BYTE $0xf6 // test sil, sil
+ JE LBB2_21
+ LONG $0x01fe8040 // cmp sil, 1
+ JNE LBB2_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_37
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_65
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_105
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_108
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_10
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_297
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_297
+
+LBB2_10:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_421:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_423
+
+LBB2_422:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_422
+
+LBB2_423:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_424:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_424
+ JMP LBB2_737
+
+LBB2_11:
+ LONG $0x02fe8040 // cmp sil, 2
+ JE LBB2_29
+ LONG $0x03fe8040 // cmp sil, 3
+ JNE LBB2_737
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_44
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_70
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_111
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_114
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_20
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_300
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_300
+
+LBB2_20:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_429:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_431
+
+LBB2_430:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_430
+
+LBB2_431:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_432:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_432
+ JMP LBB2_737
+
+LBB2_21:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_51
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_75
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_117
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_120
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028b // mov eax, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_28
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_303
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_303
+
+LBB2_28:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_437:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_439
+
+LBB2_438:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_438
+
+LBB2_439:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_440:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_440
+ JMP LBB2_737
+
+LBB2_29:
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JG LBB2_58
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JLE LBB2_80
+ WORD $0xff83; BYTE $0x04 // cmp edi, 4
+ JE LBB2_123
+ WORD $0xff83; BYTE $0x05 // cmp edi, 5
+ JE LBB2_126
+ WORD $0xff83; BYTE $0x06 // cmp edi, 6
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028b // mov eax, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_36
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_306
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_306
+
+LBB2_36:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_445:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_447
+
+LBB2_446:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_446
+
+LBB2_447:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_448:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_448
+ JMP LBB2_737
+
+LBB2_37:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_85
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_129
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_132
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff2 // movsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_43
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_309
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_309
+
+LBB2_43:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_453:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_455
+
+LBB2_454:
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x0c5c0ff2; BYTE $0xd1 // subsd xmm1, qword [rcx + 8*rdx]
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_454
+
+LBB2_455:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_456:
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x0c5c0ff2; BYTE $0xd1 // subsd xmm1, qword [rcx + 8*rdx]
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x4c5c0ff2; WORD $0x08d1 // subsd xmm1, qword [rcx + 8*rdx + 8]
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x08 // movsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x4c5c0ff2; WORD $0x10d1 // subsd xmm1, qword [rcx + 8*rdx + 16]
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x10 // movsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x4c5c0ff2; WORD $0x18d1 // subsd xmm1, qword [rcx + 8*rdx + 24]
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x18 // movsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_456
+ JMP LBB2_737
+
+LBB2_44:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_90
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_135
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_138
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff2 // movsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_50
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_312
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_312
+
+LBB2_50:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_461:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_463
+
+LBB2_462:
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x0c5c0ff2; BYTE $0xd1 // subsd xmm1, qword [rcx + 8*rdx]
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_462
+
+LBB2_463:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_464:
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x0c5c0ff2; BYTE $0xd1 // subsd xmm1, qword [rcx + 8*rdx]
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x4c5c0ff2; WORD $0x08d1 // subsd xmm1, qword [rcx + 8*rdx + 8]
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x08 // movsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x4c5c0ff2; WORD $0x10d1 // subsd xmm1, qword [rcx + 8*rdx + 16]
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x10 // movsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0xc8280f66 // movapd xmm1, xmm0
+ LONG $0x4c5c0ff2; WORD $0x18d1 // subsd xmm1, qword [rcx + 8*rdx + 24]
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x18 // movsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_464
+ JMP LBB2_737
+
+LBB2_51:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_95
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_141
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_144
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff2 // movsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_57
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_315
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_315
+
+LBB2_57:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_469:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_471
+
+LBB2_470:
+ LONG $0x0c100ff2; BYTE $0xd1 // movsd xmm1, qword [rcx + 8*rdx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_470
+
+LBB2_471:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_472:
+ LONG $0x0c100ff2; BYTE $0xd1 // movsd xmm1, qword [rcx + 8*rdx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0x4c100ff2; WORD $0x08d1 // movsd xmm1, qword [rcx + 8*rdx + 8]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x08 // movsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0x4c100ff2; WORD $0x10d1 // movsd xmm1, qword [rcx + 8*rdx + 16]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x10 // movsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0x4c100ff2; WORD $0x18d1 // movsd xmm1, qword [rcx + 8*rdx + 24]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x18 // movsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_472
+ JMP LBB2_737
+
+LBB2_58:
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JLE LBB2_100
+ WORD $0xff83; BYTE $0x09 // cmp edi, 9
+ JE LBB2_147
+ WORD $0xff83; BYTE $0x0b // cmp edi, 11
+ JE LBB2_150
+ WORD $0xff83; BYTE $0x0c // cmp edi, 12
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff2 // movsd xmm0, qword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_64
+ LONG $0xc1148d48 // lea rdx, [rcx + 8*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_318
+ LONG $0xc0148d49 // lea rdx, [r8 + 8*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_318
+
+LBB2_64:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_477:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_479
+
+LBB2_478:
+ LONG $0x0c100ff2; BYTE $0xd1 // movsd xmm1, qword [rcx + 8*rdx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_478
+
+LBB2_479:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_480:
+ LONG $0x0c100ff2; BYTE $0xd1 // movsd xmm1, qword [rcx + 8*rdx]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd00c // movsd qword [r8 + 8*rdx], xmm1
+ LONG $0x4c100ff2; WORD $0x08d1 // movsd xmm1, qword [rcx + 8*rdx + 8]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x08 // movsd qword [r8 + 8*rdx + 8], xmm1
+ LONG $0x4c100ff2; WORD $0x10d1 // movsd xmm1, qword [rcx + 8*rdx + 16]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x10 // movsd qword [r8 + 8*rdx + 16], xmm1
+ LONG $0x4c100ff2; WORD $0x18d1 // movsd xmm1, qword [rcx + 8*rdx + 24]
+ LONG $0xc8580ff2 // addsd xmm1, xmm0
+ LONG $0x110f41f2; WORD $0xd04c; BYTE $0x18 // movsd qword [r8 + 8*rdx + 24], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_480
+ JMP LBB2_737
+
+LBB2_65:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_153
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8a44; BYTE $0x1a // mov r11b, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_69
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_321
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_321
+
+LBB2_69:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_485:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_487
+
+LBB2_486:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_486
+
+LBB2_487:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_488:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_488
+ JMP LBB2_737
+
+LBB2_70:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_156
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8a44; BYTE $0x1a // mov r11b, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_74
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_324
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_324
+
+LBB2_74:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_493:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_495
+
+LBB2_494:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_494
+
+LBB2_495:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_496:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_496
+ JMP LBB2_737
+
+LBB2_75:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_159
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_79
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_327
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_327
+
+LBB2_79:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_501:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_503
+
+LBB2_502:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_502
+
+LBB2_503:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_504:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_504
+ JMP LBB2_737
+
+LBB2_80:
+ WORD $0xff83; BYTE $0x02 // cmp edi, 2
+ JE LBB2_162
+ WORD $0xff83; BYTE $0x03 // cmp edi, 3
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_84
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_330
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_330
+
+LBB2_84:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_509:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_511
+
+LBB2_510:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_510
+
+LBB2_511:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_512:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_512
+ JMP LBB2_737
+
+LBB2_85:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_165
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_89
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_333
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_333
+
+LBB2_89:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_517:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_519
+
+LBB2_518:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_518
+
+LBB2_519:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_520:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_520
+ JMP LBB2_737
+
+LBB2_90:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_168
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_94
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_336
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_336
+
+LBB2_94:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_525:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_527
+
+LBB2_526:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_526
+
+LBB2_527:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_528:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_528
+ JMP LBB2_737
+
+LBB2_95:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_171
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_99
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_339
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_339
+
+LBB2_99:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_533:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_535
+
+LBB2_534:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_534
+
+LBB2_535:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_536:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_536
+ JMP LBB2_737
+
+LBB2_100:
+ WORD $0xff83; BYTE $0x07 // cmp edi, 7
+ JE LBB2_174
+ WORD $0xff83; BYTE $0x08 // cmp edi, 8
+ JNE LBB2_737
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_104
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_342
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_342
+
+LBB2_104:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_541:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_543
+
+LBB2_542:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_542
+
+LBB2_543:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_544:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_544
+ JMP LBB2_737
+
+LBB2_105:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_107
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_345
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_345
+
+LBB2_107:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_549:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_551
+
+LBB2_550:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_550
+
+LBB2_551:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_552:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_552
+ JMP LBB2_737
+
+LBB2_108:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_110
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_348
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_348
+
+LBB2_110:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_557:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_559
+
+LBB2_558:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_558
+
+LBB2_559:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_560:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_560
+ JMP LBB2_737
+
+LBB2_111:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_113
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_351
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_351
+
+LBB2_113:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_565:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_567
+
+LBB2_566:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_566
+
+LBB2_567:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_568:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_568
+ JMP LBB2_737
+
+LBB2_114:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_116
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_354
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_354
+
+LBB2_116:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_573:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_575
+
+LBB2_574:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_574
+
+LBB2_575:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_576:
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71142b66 // sub dx, word [rcx + 2*rsi]
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x02 // sub dx, word [rcx + 2*rsi + 2]
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x04 // sub dx, word [rcx + 2*rsi + 4]
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ WORD $0xc289 // mov edx, eax
+ LONG $0x71542b66; BYTE $0x06 // sub dx, word [rcx + 2*rsi + 6]
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_576
+ JMP LBB2_737
+
+LBB2_117:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_119
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_357
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_357
+
+LBB2_119:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_581:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_583
+
+LBB2_582:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_582
+
+LBB2_583:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_584:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_584
+ JMP LBB2_737
+
+LBB2_120:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_122
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_360
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_360
+
+LBB2_122:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_589:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_591
+
+LBB2_590:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_590
+
+LBB2_591:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_592:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_592
+ JMP LBB2_737
+
+LBB2_123:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_125
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_363
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_363
+
+LBB2_125:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_597:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_599
+
+LBB2_598:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_598
+
+LBB2_599:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_600:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_600
+ JMP LBB2_737
+
+LBB2_126:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0xb70f; BYTE $0x02 // movzx eax, word [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x10f98341 // cmp r9d, 16
+ JB LBB2_128
+ LONG $0x51148d4a // lea rdx, [rcx + 2*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_366
+ LONG $0x50148d4b // lea rdx, [r8 + 2*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_366
+
+LBB2_128:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_605:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_607
+
+LBB2_606:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_606
+
+LBB2_607:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_608:
+ LONG $0x7114b70f // movzx edx, word [rcx + 2*rsi]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x14894166; BYTE $0x70 // mov word [r8 + 2*rsi], dx
+ LONG $0x7154b70f; BYTE $0x02 // movzx edx, word [rcx + 2*rsi + 2]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0270 // mov word [r8 + 2*rsi + 2], dx
+ LONG $0x7154b70f; BYTE $0x04 // movzx edx, word [rcx + 2*rsi + 4]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0470 // mov word [r8 + 2*rsi + 4], dx
+ LONG $0x7154b70f; BYTE $0x06 // movzx edx, word [rcx + 2*rsi + 6]
+ WORD $0x0166; BYTE $0xc2 // add dx, ax
+ LONG $0x54894166; WORD $0x0670 // mov word [r8 + 2*rsi + 6], dx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_608
+ JMP LBB2_737
+
+LBB2_129:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_131
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_369
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_369
+
+LBB2_131:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_613:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_615
+
+LBB2_614:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_614
+
+LBB2_615:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_616:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_616
+ JMP LBB2_737
+
+LBB2_132:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff3 // movss xmm0, dword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_134
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_372
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_372
+
+LBB2_134:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_621:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_623
+
+LBB2_622:
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x0c5c0ff3; BYTE $0x91 // subss xmm1, dword [rcx + 4*rdx]
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_622
+
+LBB2_623:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_624:
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x0c5c0ff3; BYTE $0x91 // subss xmm1, dword [rcx + 4*rdx]
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x4c5c0ff3; WORD $0x0491 // subss xmm1, dword [rcx + 4*rdx + 4]
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x04 // movss dword [r8 + 4*rdx + 4], xmm1
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x4c5c0ff3; WORD $0x0891 // subss xmm1, dword [rcx + 4*rdx + 8]
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x08 // movss dword [r8 + 4*rdx + 8], xmm1
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x4c5c0ff3; WORD $0x0c91 // subss xmm1, dword [rcx + 4*rdx + 12]
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x0c // movss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_624
+ JMP LBB2_737
+
+LBB2_135:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b4c; BYTE $0x1a // mov r11, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_137
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_375
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_375
+
+LBB2_137:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_629:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_631
+
+LBB2_630:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_630
+
+LBB2_631:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_632:
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1042b48 // sub rax, qword [rcx + 8*rsi]
+ LONG $0xf0048949 // mov qword [r8 + 8*rsi], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x08 // sub rax, qword [rcx + 8*rsi + 8]
+ LONG $0xf0448949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x10 // sub rax, qword [rcx + 8*rsi + 16]
+ LONG $0xf0448949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rax
+ WORD $0x894c; BYTE $0xd8 // mov rax, r11
+ LONG $0xf1442b48; BYTE $0x18 // sub rax, qword [rcx + 8*rsi + 24]
+ LONG $0xf0448949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_632
+ JMP LBB2_737
+
+LBB2_138:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff3 // movss xmm0, dword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_140
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_378
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_378
+
+LBB2_140:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_637:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_639
+
+LBB2_638:
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x0c5c0ff3; BYTE $0x91 // subss xmm1, dword [rcx + 4*rdx]
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_638
+
+LBB2_639:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_640:
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x0c5c0ff3; BYTE $0x91 // subss xmm1, dword [rcx + 4*rdx]
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x4c5c0ff3; WORD $0x0491 // subss xmm1, dword [rcx + 4*rdx + 4]
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x04 // movss dword [r8 + 4*rdx + 4], xmm1
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x4c5c0ff3; WORD $0x0891 // subss xmm1, dword [rcx + 4*rdx + 8]
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x08 // movss dword [r8 + 4*rdx + 8], xmm1
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x4c5c0ff3; WORD $0x0c91 // subss xmm1, dword [rcx + 4*rdx + 12]
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x0c // movss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_640
+ JMP LBB2_737
+
+LBB2_141:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_143
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_381
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_381
+
+LBB2_143:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_645:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_647
+
+LBB2_646:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_646
+
+LBB2_647:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_648:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_648
+ JMP LBB2_737
+
+LBB2_144:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff3 // movss xmm0, dword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_146
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_384
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_384
+
+LBB2_146:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_653:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_655
+
+LBB2_654:
+ LONG $0x0c100ff3; BYTE $0x91 // movss xmm1, dword [rcx + 4*rdx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_654
+
+LBB2_655:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_656:
+ LONG $0x0c100ff3; BYTE $0x91 // movss xmm1, dword [rcx + 4*rdx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ LONG $0x4c100ff3; WORD $0x0491 // movss xmm1, dword [rcx + 4*rdx + 4]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x04 // movss dword [r8 + 4*rdx + 4], xmm1
+ LONG $0x4c100ff3; WORD $0x0891 // movss xmm1, dword [rcx + 4*rdx + 8]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x08 // movss dword [r8 + 4*rdx + 8], xmm1
+ LONG $0x4c100ff3; WORD $0x0c91 // movss xmm1, dword [rcx + 4*rdx + 12]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x0c // movss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_656
+ JMP LBB2_737
+
+LBB2_147:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b48; BYTE $0x02 // mov rax, qword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x04f98341 // cmp r9d, 4
+ JB LBB2_149
+ LONG $0xd1148d4a // lea rdx, [rcx + 8*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_387
+ LONG $0xd0148d4b // lea rdx, [r8 + 8*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_387
+
+LBB2_149:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_661:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_663
+
+LBB2_662:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_662
+
+LBB2_663:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_664:
+ LONG $0xf1148b48 // mov rdx, qword [rcx + 8*rsi]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0148949 // mov qword [r8 + 8*rsi], rdx
+ LONG $0xf1548b48; BYTE $0x08 // mov rdx, qword [rcx + 8*rsi + 8]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x08 // mov qword [r8 + 8*rsi + 8], rdx
+ LONG $0xf1548b48; BYTE $0x10 // mov rdx, qword [rcx + 8*rsi + 16]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x10 // mov qword [r8 + 8*rsi + 16], rdx
+ LONG $0xf1548b48; BYTE $0x18 // mov rdx, qword [rcx + 8*rsi + 24]
+ WORD $0x0148; BYTE $0xc2 // add rdx, rax
+ LONG $0xf0548949; BYTE $0x18 // mov qword [r8 + 8*rsi + 24], rdx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_664
+ JMP LBB2_737
+
+LBB2_150:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ LONG $0x02100ff3 // movss xmm0, dword [rdx]
+ WORD $0x8944; BYTE $0xc8 // mov eax, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_152
+ LONG $0x81148d48 // lea rdx, [rcx + 4*rax]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_390
+ LONG $0x80148d49 // lea rdx, [r8 + 4*rax]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_390
+
+LBB2_152:
+ WORD $0xd231 // xor edx, edx
+
+LBB2_669:
+ WORD $0x8948; BYTE $0xd6 // mov rsi, rdx
+ WORD $0xf748; BYTE $0xd6 // not rsi
+ WORD $0x0148; BYTE $0xc6 // add rsi, rax
+ WORD $0x8948; BYTE $0xc7 // mov rdi, rax
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_671
+
+LBB2_670:
+ LONG $0x0c100ff3; BYTE $0x91 // movss xmm1, dword [rcx + 4*rdx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ LONG $0x01c28348 // add rdx, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_670
+
+LBB2_671:
+ LONG $0x03fe8348 // cmp rsi, 3
+ JB LBB2_737
+
+LBB2_672:
+ LONG $0x0c100ff3; BYTE $0x91 // movss xmm1, dword [rcx + 4*rdx]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x900c // movss dword [r8 + 4*rdx], xmm1
+ LONG $0x4c100ff3; WORD $0x0491 // movss xmm1, dword [rcx + 4*rdx + 4]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x04 // movss dword [r8 + 4*rdx + 4], xmm1
+ LONG $0x4c100ff3; WORD $0x0891 // movss xmm1, dword [rcx + 4*rdx + 8]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x08 // movss dword [r8 + 4*rdx + 8], xmm1
+ LONG $0x4c100ff3; WORD $0x0c91 // movss xmm1, dword [rcx + 4*rdx + 12]
+ LONG $0xc8580ff3 // addss xmm1, xmm0
+ LONG $0x110f41f3; WORD $0x904c; BYTE $0x0c // movss dword [r8 + 4*rdx + 12], xmm1
+ LONG $0x04c28348 // add rdx, 4
+ WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
+ JNE LBB2_672
+ JMP LBB2_737
+
+LBB2_153:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8a44; BYTE $0x1a // mov r11b, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_155
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_393
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_393
+
+LBB2_155:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_677:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_679
+
+LBB2_678:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_678
+
+LBB2_679:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_680:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_680
+ JMP LBB2_737
+
+LBB2_156:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8a44; BYTE $0x1a // mov r11b, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_158
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_396
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_396
+
+LBB2_158:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_685:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_687
+
+LBB2_686:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_686
+
+LBB2_687:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_688:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042a; BYTE $0x31 // sub al, byte [rcx + rsi]
+ LONG $0x30048841 // mov byte [r8 + rsi], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0131442a // sub al, byte [rcx + rsi + 1]
+ LONG $0x30448841; BYTE $0x01 // mov byte [r8 + rsi + 1], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0231442a // sub al, byte [rcx + rsi + 2]
+ LONG $0x30448841; BYTE $0x02 // mov byte [r8 + rsi + 2], al
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0331442a // sub al, byte [rcx + rsi + 3]
+ LONG $0x30448841; BYTE $0x03 // mov byte [r8 + rsi + 3], al
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_688
+ JMP LBB2_737
+
+LBB2_159:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_161
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_399
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_399
+
+LBB2_161:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_693:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_695
+
+LBB2_694:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_694
+
+LBB2_695:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_696:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_696
+ JMP LBB2_737
+
+LBB2_162:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028a // mov al, byte [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x20f98341 // cmp r9d, 32
+ JB LBB2_164
+ LONG $0x11148d4a // lea rdx, [rcx + r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_402
+ LONG $0x10148d4b // lea rdx, [r8 + r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_402
+
+LBB2_164:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_701:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_703
+
+LBB2_702:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_702
+
+LBB2_703:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_704:
+ LONG $0x3114b60f // movzx edx, byte [rcx + rsi]
+ WORD $0xc200 // add dl, al
+ LONG $0x30148841 // mov byte [r8 + rsi], dl
+ LONG $0x3154b60f; BYTE $0x01 // movzx edx, byte [rcx + rsi + 1]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x01 // mov byte [r8 + rsi + 1], dl
+ LONG $0x3154b60f; BYTE $0x02 // movzx edx, byte [rcx + rsi + 2]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x02 // mov byte [r8 + rsi + 2], dl
+ LONG $0x3154b60f; BYTE $0x03 // movzx edx, byte [rcx + rsi + 3]
+ WORD $0xc200 // add dl, al
+ LONG $0x30548841; BYTE $0x03 // mov byte [r8 + rsi + 3], dl
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_704
+ JMP LBB2_737
+
+LBB2_165:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_167
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_405
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_405
+
+LBB2_167:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_709:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_711
+
+LBB2_710:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_710
+
+LBB2_711:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_712:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_712
+ JMP LBB2_737
+
+LBB2_168:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x8b44; BYTE $0x1a // mov r11d, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_170
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_408
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_408
+
+LBB2_170:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_717:
+ WORD $0x8948; BYTE $0xf2 // mov rdx, rsi
+ WORD $0xf748; BYTE $0xd2 // not rdx
+ WORD $0x014c; BYTE $0xd2 // add rdx, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_719
+
+LBB2_718:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_718
+
+LBB2_719:
+ LONG $0x03fa8348 // cmp rdx, 3
+ JB LBB2_737
+
+LBB2_720:
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ WORD $0x042b; BYTE $0xb1 // sub eax, dword [rcx + 4*rsi]
+ LONG $0xb0048941 // mov dword [r8 + 4*rsi], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x04b1442b // sub eax, dword [rcx + 4*rsi + 4]
+ LONG $0xb0448941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x08b1442b // sub eax, dword [rcx + 4*rsi + 8]
+ LONG $0xb0448941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], eax
+ WORD $0x8944; BYTE $0xd8 // mov eax, r11d
+ LONG $0x0cb1442b // sub eax, dword [rcx + 4*rsi + 12]
+ LONG $0xb0448941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], eax
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_720
+ JMP LBB2_737
+
+LBB2_171:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028b // mov eax, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_173
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_411
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_411
+
+LBB2_173:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_725:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_727
+
+LBB2_726:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_726
+
+LBB2_727:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_728:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_728
+ JMP LBB2_737
+
+LBB2_174:
+ WORD $0x8545; BYTE $0xc9 // test r9d, r9d
+ JLE LBB2_737
+ WORD $0x028b // mov eax, dword [rdx]
+ WORD $0x8945; BYTE $0xca // mov r10d, r9d
+ LONG $0x08f98341 // cmp r9d, 8
+ JB LBB2_176
+ LONG $0x91148d4a // lea rdx, [rcx + 4*r10]
+ WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
+ JBE LBB2_414
+ LONG $0x90148d4b // lea rdx, [r8 + 4*r10]
+ WORD $0x3948; BYTE $0xca // cmp rdx, rcx
+ JBE LBB2_414
+
+LBB2_176:
+ WORD $0xf631 // xor esi, esi
+
+LBB2_733:
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ WORD $0xf749; BYTE $0xd1 // not r9
+ WORD $0x014d; BYTE $0xd1 // add r9, r10
+ WORD $0x894c; BYTE $0xd7 // mov rdi, r10
+ LONG $0x03e78348 // and rdi, 3
+ JE LBB2_735
+
+LBB2_734:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x01c68348 // add rsi, 1
+ LONG $0xffc78348 // add rdi, -1
+ JNE LBB2_734
+
+LBB2_735:
+ LONG $0x03f98349 // cmp r9, 3
+ JB LBB2_737
+
+LBB2_736:
+ WORD $0x148b; BYTE $0xb1 // mov edx, dword [rcx + 4*rsi]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0148941 // mov dword [r8 + 4*rsi], edx
+ LONG $0x04b1548b // mov edx, dword [rcx + 4*rsi + 4]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x04 // mov dword [r8 + 4*rsi + 4], edx
+ LONG $0x08b1548b // mov edx, dword [rcx + 4*rsi + 8]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x08 // mov dword [r8 + 4*rsi + 8], edx
+ LONG $0x0cb1548b // mov edx, dword [rcx + 4*rsi + 12]
+ WORD $0xc201 // add edx, eax
+ LONG $0xb0548941; BYTE $0x0c // mov dword [r8 + 4*rsi + 12], edx
+ LONG $0x04c68348 // add rsi, 4
+ WORD $0x3949; BYTE $0xf2 // cmp r10, rsi
+ JNE LBB2_736
+ JMP LBB2_737
+
+LBB2_297:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0x6e0f4166; BYTE $0xc3 // movd xmm0, r11d
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_417
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_299:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb85c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_299
+ JMP LBB2_418
+
+LBB2_300:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0x6e0f4166; BYTE $0xc3 // movd xmm0, r11d
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_425
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_302:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb85c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_302
+ JMP LBB2_426
+
+LBB2_303:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_433
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_305:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_305
+ JMP LBB2_434
+
+LBB2_306:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_441
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_308:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_308
+ JMP LBB2_442
+
+LBB2_309:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xfc // and edx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc728d48 // lea rsi, [rdx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_449
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_311:
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xe1280f66 // movapd xmm4, xmm1
+ LONG $0xe25c0f66 // subpd xmm4, xmm2
+ LONG $0xd1280f66 // movapd xmm2, xmm1
+ LONG $0xd35c0f66 // subpd xmm2, xmm3
+ LONG $0x110f4166; WORD $0xf824 // movupd oword [r8 + 8*rdi], xmm4
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30f9 // movupd xmm3, oword [rcx + 8*rdi + 48]
+ LONG $0xe1280f66 // movapd xmm4, xmm1
+ LONG $0xe25c0f66 // subpd xmm4, xmm2
+ LONG $0xd1280f66 // movapd xmm2, xmm1
+ LONG $0xd35c0f66 // subpd xmm2, xmm3
+ LONG $0x110f4166; WORD $0xf864; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm4
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_311
+ JMP LBB2_450
+
+LBB2_312:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xfc // and edx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc728d48 // lea rsi, [rdx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_457
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_314:
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xe1280f66 // movapd xmm4, xmm1
+ LONG $0xe25c0f66 // subpd xmm4, xmm2
+ LONG $0xd1280f66 // movapd xmm2, xmm1
+ LONG $0xd35c0f66 // subpd xmm2, xmm3
+ LONG $0x110f4166; WORD $0xf824 // movupd oword [r8 + 8*rdi], xmm4
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30f9 // movupd xmm3, oword [rcx + 8*rdi + 48]
+ LONG $0xe1280f66 // movapd xmm4, xmm1
+ LONG $0xe25c0f66 // subpd xmm4, xmm2
+ LONG $0xd1280f66 // movapd xmm2, xmm1
+ LONG $0xd35c0f66 // subpd xmm2, xmm3
+ LONG $0x110f4166; WORD $0xf864; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm4
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_314
+ JMP LBB2_458
+
+LBB2_315:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xfc // and edx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc728d48 // lea rsi, [rdx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_465
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_317:
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30f9 // movupd xmm3, oword [rcx + 8*rdi + 48]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm3
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_317
+ JMP LBB2_466
+
+LBB2_318:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xfc // and edx, -4
+ LONG $0xc8120ff2 // movddup xmm1, xmm0
+ LONG $0xfc728d48 // lea rsi, [rdx - 4]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_473
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_320:
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+ LONG $0x54100f66; WORD $0x20f9 // movupd xmm2, oword [rcx + 8*rdi + 32]
+ LONG $0x5c100f66; WORD $0x30f9 // movupd xmm3, oword [rcx + 8*rdi + 48]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf854; BYTE $0x20 // movupd oword [r8 + 8*rdi + 32], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x30 // movupd oword [r8 + 8*rdi + 48], xmm3
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_320
+ JMP LBB2_474
+
+LBB2_321:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xd3b60f41 // movzx edx, r11b
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_481
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_323:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x385c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_323
+ JMP LBB2_482
+
+LBB2_324:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xd3b60f41 // movzx edx, r11b
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_489
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_326:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x385c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_326
+ JMP LBB2_490
+
+LBB2_327:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xd0 // movzx edx, al
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_497
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_329:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_329
+ JMP LBB2_498
+
+LBB2_330:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xd0 // movzx edx, al
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_505
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_332:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_332
+ JMP LBB2_506
+
+LBB2_333:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4966; BYTE $0xc3 // movq xmm0, r11
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_513
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_335:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf85c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_335
+ JMP LBB2_514
+
+LBB2_336:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4966; BYTE $0xc3 // movq xmm0, r11
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_521
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_338:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf85c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_338
+ JMP LBB2_522
+
+LBB2_339:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_529
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_341:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_341
+ JMP LBB2_530
+
+LBB2_342:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_537
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_344:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_344
+ JMP LBB2_538
+
+LBB2_345:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_545
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_347:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x785c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_347
+ JMP LBB2_546
+
+LBB2_348:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_553
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_350:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x785c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_350
+ JMP LBB2_554
+
+LBB2_351:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_561
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_353:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x785c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_353
+ JMP LBB2_562
+
+LBB2_354:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_569
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_356:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf90f66 // psubw xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x785c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm1
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_356
+ JMP LBB2_570
+
+LBB2_357:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_577
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_359:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_359
+ JMP LBB2_578
+
+LBB2_360:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_585
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_362:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_362
+ JMP LBB2_586
+
+LBB2_363:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_593
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_365:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_365
+ JMP LBB2_594
+
+LBB2_366:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf0 // and esi, -16
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700ff2; BYTE $0xe0 // pshuflw xmm0, xmm0, 224
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf0568d48 // lea rdx, [rsi - 16]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x04e9c149 // shr r9, 4
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_601
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_368:
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2079 // movdqu xmm1, oword [rcx + 2*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3079 // movdqu xmm2, oword [rcx + 2*rdi + 48]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x784c; BYTE $0x20 // movdqu oword [r8 + 2*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x30 // movdqu oword [r8 + 2*rdi + 48], xmm2
+ LONG $0x20c78348 // add rdi, 32
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_368
+ JMP LBB2_602
+
+LBB2_369:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4966; BYTE $0xc3 // movq xmm0, r11
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_609
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_371:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf85c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_371
+ JMP LBB2_610
+
+LBB2_372:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf8 // and edx, -8
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0
+ LONG $0xf8728d48 // lea rsi, [rdx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_617
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_374:
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x280f; BYTE $0xe1 // movaps xmm4, xmm1
+ WORD $0x5c0f; BYTE $0xe2 // subps xmm4, xmm2
+ WORD $0x280f; BYTE $0xd1 // movaps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd3 // subps xmm2, xmm3
+ LONG $0x24110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm4
+ LONG $0x54110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm2
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xb95c100f; BYTE $0x30 // movups xmm3, oword [rcx + 4*rdi + 48]
+ WORD $0x280f; BYTE $0xe1 // movaps xmm4, xmm1
+ WORD $0x5c0f; BYTE $0xe2 // subps xmm4, xmm2
+ WORD $0x280f; BYTE $0xd1 // movaps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd3 // subps xmm2, xmm3
+ LONG $0x64110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm4
+ LONG $0x54110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_374
+ JMP LBB2_618
+
+LBB2_375:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4966; BYTE $0xc3 // movq xmm0, r11
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_625
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_377:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafb0f66 // psubq xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xf85c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm1
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_377
+ JMP LBB2_626
+
+LBB2_378:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf8 // and edx, -8
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0
+ LONG $0xf8728d48 // lea rsi, [rdx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_633
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_380:
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x280f; BYTE $0xe1 // movaps xmm4, xmm1
+ WORD $0x5c0f; BYTE $0xe2 // subps xmm4, xmm2
+ WORD $0x280f; BYTE $0xd1 // movaps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd3 // subps xmm2, xmm3
+ LONG $0x24110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm4
+ LONG $0x54110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm2
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xb95c100f; BYTE $0x30 // movups xmm3, oword [rcx + 4*rdi + 48]
+ WORD $0x280f; BYTE $0xe1 // movaps xmm4, xmm1
+ WORD $0x5c0f; BYTE $0xe2 // subps xmm4, xmm2
+ WORD $0x280f; BYTE $0xd1 // movaps xmm2, xmm1
+ WORD $0x5c0f; BYTE $0xd3 // subps xmm2, xmm3
+ LONG $0x64110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm4
+ LONG $0x54110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_380
+ JMP LBB2_634
+
+LBB2_381:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_641
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_383:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_383
+ JMP LBB2_642
+
+LBB2_384:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf8 // and edx, -8
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0
+ LONG $0xf8728d48 // lea rsi, [rdx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_649
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_386:
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xb95c100f; BYTE $0x30 // movups xmm3, oword [rcx + 4*rdi + 48]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x5c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm3
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_386
+ JMP LBB2_650
+
+LBB2_387:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xfc // and esi, -4
+ LONG $0x6e0f4866; BYTE $0xc0 // movq xmm0, rax
+ LONG $0xc0700f66; BYTE $0x44 // pshufd xmm0, xmm0, 68
+ LONG $0xfc568d48 // lea rdx, [rsi - 4]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x02e9c149 // shr r9, 2
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_657
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_389:
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20f9 // movdqu xmm1, oword [rcx + 8*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30f9 // movdqu xmm2, oword [rcx + 8*rdi + 48]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf84c; BYTE $0x20 // movdqu oword [r8 + 8*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x30 // movdqu oword [r8 + 8*rdi + 48], xmm2
+ LONG $0x08c78348 // add rdi, 8
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_389
+ JMP LBB2_658
+
+LBB2_390:
+ WORD $0xc289 // mov edx, eax
+ WORD $0xe283; BYTE $0xf8 // and edx, -8
+ WORD $0x280f; BYTE $0xc8 // movaps xmm1, xmm0
+ LONG $0x00c8c60f // shufps xmm1, xmm0, 0
+ LONG $0xf8728d48 // lea rsi, [rdx - 8]
+ WORD $0x8949; BYTE $0xf1 // mov r9, rsi
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xf6 // test rsi, rsi
+ JE LBB2_665
+ WORD $0x894c; BYTE $0xce // mov rsi, r9
+ LONG $0xfee68348 // and rsi, -2
+ WORD $0xf748; BYTE $0xde // neg rsi
+ WORD $0xff31 // xor edi, edi
+
+LBB2_392:
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+ LONG $0xb954100f; BYTE $0x20 // movups xmm2, oword [rcx + 4*rdi + 32]
+ LONG $0xb95c100f; BYTE $0x30 // movups xmm3, oword [rcx + 4*rdi + 48]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x54110f41; WORD $0x20b8 // movups oword [r8 + 4*rdi + 32], xmm2
+ LONG $0x5c110f41; WORD $0x30b8 // movups oword [r8 + 4*rdi + 48], xmm3
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c68348 // add rsi, 2
+ JNE LBB2_392
+ JMP LBB2_666
+
+LBB2_393:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xd3b60f41 // movzx edx, r11b
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_673
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_395:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x385c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_395
+ JMP LBB2_674
+
+LBB2_396:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ LONG $0xd3b60f41 // movzx edx, r11b
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_681
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_398:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcaf80f66 // psubb xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0x385c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm1
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_398
+ JMP LBB2_682
+
+LBB2_399:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xd0 // movzx edx, al
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_689
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_401:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_401
+ JMP LBB2_690
+
+LBB2_402:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xe0 // and esi, -32
+ WORD $0xb60f; BYTE $0xd0 // movzx edx, al
+ LONG $0xc26e0f66 // movd xmm0, edx
+ LONG $0xc9ef0f66 // pxor xmm1, xmm1
+ LONG $0x00380f66; BYTE $0xc1 // pshufb xmm0, xmm1
+ LONG $0xe0568d48 // lea rdx, [rsi - 32]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x05e9c149 // shr r9, 5
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_697
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_404:
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x2039 // movdqu xmm1, oword [rcx + rdi + 32]
+ LONG $0x546f0ff3; WORD $0x3039 // movdqu xmm2, oword [rcx + rdi + 48]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x384c; BYTE $0x20 // movdqu oword [r8 + rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x30 // movdqu oword [r8 + rdi + 48], xmm2
+ LONG $0x40c78348 // add rdi, 64
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_404
+ JMP LBB2_698
+
+LBB2_405:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0x6e0f4166; BYTE $0xc3 // movd xmm0, r11d
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_705
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_407:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb85c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_407
+ JMP LBB2_706
+
+LBB2_408:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0x6e0f4166; BYTE $0xc3 // movd xmm0, r11d
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_713
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_410:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm1
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc86f0f66 // movdqa xmm1, xmm0
+ LONG $0xcafa0f66 // psubd xmm1, xmm2
+ LONG $0x7f0f41f3; WORD $0xb85c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm3
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm1
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_410
+ JMP LBB2_714
+
+LBB2_411:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_721
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_413:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_413
+ JMP LBB2_722
+
+LBB2_414:
+ WORD $0x8944; BYTE $0xd6 // mov esi, r10d
+ WORD $0xe683; BYTE $0xf8 // and esi, -8
+ LONG $0xc06e0f66 // movd xmm0, eax
+ LONG $0xc0700f66; BYTE $0x00 // pshufd xmm0, xmm0, 0
+ LONG $0xf8568d48 // lea rdx, [rsi - 8]
+ WORD $0x8949; BYTE $0xd1 // mov r9, rdx
+ LONG $0x03e9c149 // shr r9, 3
+ LONG $0x01c18349 // add r9, 1
+ WORD $0x8548; BYTE $0xd2 // test rdx, rdx
+ JE LBB2_729
+ WORD $0x894c; BYTE $0xca // mov rdx, r9
+ LONG $0xfee28348 // and rdx, -2
+ WORD $0xf748; BYTE $0xda // neg rdx
+ WORD $0xff31 // xor edi, edi
+
+LBB2_416:
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+ LONG $0x4c6f0ff3; WORD $0x20b9 // movdqu xmm1, oword [rcx + 4*rdi + 32]
+ LONG $0x546f0ff3; WORD $0x30b9 // movdqu xmm2, oword [rcx + 4*rdi + 48]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb84c; BYTE $0x20 // movdqu oword [r8 + 4*rdi + 32], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x30 // movdqu oword [r8 + 4*rdi + 48], xmm2
+ LONG $0x10c78348 // add rdi, 16
+ LONG $0x02c28348 // add rdx, 2
+ JNE LBB2_416
+ JMP LBB2_730
+
+LBB2_417:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_418:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_420
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB2_420:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_421
+
+LBB2_425:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_426:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_428
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB2_428:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_429
+
+LBB2_433:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_434:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_436
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB2_436:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_437
+
+LBB2_441:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_442:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_444
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB2_444:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_445
+
+LBB2_449:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_450:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_452
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xe1280f66 // movapd xmm4, xmm1
+ LONG $0xe25c0f66 // subpd xmm4, xmm2
+ LONG $0xcb5c0f66 // subpd xmm1, xmm3
+ LONG $0x110f4166; WORD $0xf824 // movupd oword [r8 + 8*rdi], xmm4
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm1
+
+LBB2_452:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_453
+
+LBB2_457:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_458:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_460
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xe1280f66 // movapd xmm4, xmm1
+ LONG $0xe25c0f66 // subpd xmm4, xmm2
+ LONG $0xcb5c0f66 // subpd xmm1, xmm3
+ LONG $0x110f4166; WORD $0xf824 // movupd oword [r8 + 8*rdi], xmm4
+ LONG $0x110f4166; WORD $0xf84c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm1
+
+LBB2_460:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_461
+
+LBB2_465:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_466:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_468
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+
+LBB2_468:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_469
+
+LBB2_473:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_474:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_476
+ LONG $0x14100f66; BYTE $0xf9 // movupd xmm2, oword [rcx + 8*rdi]
+ LONG $0x5c100f66; WORD $0x10f9 // movupd xmm3, oword [rcx + 8*rdi + 16]
+ LONG $0xd1580f66 // addpd xmm2, xmm1
+ LONG $0xd9580f66 // addpd xmm3, xmm1
+ LONG $0x110f4166; WORD $0xf814 // movupd oword [r8 + 8*rdi], xmm2
+ LONG $0x110f4166; WORD $0xf85c; BYTE $0x10 // movupd oword [r8 + 8*rdi + 16], xmm3
+
+LBB2_476:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_477
+
+LBB2_481:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_482:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_484
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB2_484:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_485
+
+LBB2_489:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_490:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_492
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB2_492:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_493
+
+LBB2_497:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_498:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_500
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB2_500:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_501
+
+LBB2_505:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_506:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_508
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB2_508:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_509
+
+LBB2_513:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_514:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_516
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB2_516:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_517
+
+LBB2_521:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_522:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_524
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB2_524:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_525
+
+LBB2_529:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_530:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_532
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB2_532:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_533
+
+LBB2_537:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_538:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_540
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB2_540:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_541
+
+LBB2_545:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_546:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_548
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB2_548:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_549
+
+LBB2_553:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_554:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_556
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB2_556:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_557
+
+LBB2_561:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_562:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_564
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB2_564:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_565
+
+LBB2_569:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_570:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_572
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f90f66 // psubw xmm3, xmm1
+ LONG $0xc2f90f66 // psubw xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x781c // movdqu oword [r8 + 2*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x7844; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm0
+
+LBB2_572:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_573
+
+LBB2_577:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_578:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_580
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB2_580:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_581
+
+LBB2_585:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_586:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_588
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB2_588:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_589
+
+LBB2_593:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_594:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_596
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB2_596:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_597
+
+LBB2_601:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_602:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_604
+ LONG $0x0c6f0ff3; BYTE $0x79 // movdqu xmm1, oword [rcx + 2*rdi]
+ LONG $0x546f0ff3; WORD $0x1079 // movdqu xmm2, oword [rcx + 2*rdi + 16]
+ LONG $0xc8fd0f66 // paddw xmm1, xmm0
+ LONG $0xd0fd0f66 // paddw xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x780c // movdqu oword [r8 + 2*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x7854; BYTE $0x10 // movdqu oword [r8 + 2*rdi + 16], xmm2
+
+LBB2_604:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_605
+
+LBB2_609:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_610:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_612
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB2_612:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_613
+
+LBB2_617:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_618:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_620
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x280f; BYTE $0xe1 // movaps xmm4, xmm1
+ WORD $0x5c0f; BYTE $0xe2 // subps xmm4, xmm2
+ WORD $0x5c0f; BYTE $0xcb // subps xmm1, xmm3
+ LONG $0x24110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm4
+ LONG $0x4c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm1
+
+LBB2_620:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_621
+
+LBB2_625:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_626:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_628
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fb0f66 // psubq xmm3, xmm1
+ LONG $0xc2fb0f66 // psubq xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xf81c // movdqu oword [r8 + 8*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xf844; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm0
+
+LBB2_628:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_629
+
+LBB2_633:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_634:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_636
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x280f; BYTE $0xe1 // movaps xmm4, xmm1
+ WORD $0x5c0f; BYTE $0xe2 // subps xmm4, xmm2
+ WORD $0x5c0f; BYTE $0xcb // subps xmm1, xmm3
+ LONG $0x24110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm4
+ LONG $0x4c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm1
+
+LBB2_636:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_637
+
+LBB2_641:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_642:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_644
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB2_644:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_645
+
+LBB2_649:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_650:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_652
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+
+LBB2_652:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_653
+
+LBB2_657:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_658:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_660
+ LONG $0x0c6f0ff3; BYTE $0xf9 // movdqu xmm1, oword [rcx + 8*rdi]
+ LONG $0x546f0ff3; WORD $0x10f9 // movdqu xmm2, oword [rcx + 8*rdi + 16]
+ LONG $0xc8d40f66 // paddq xmm1, xmm0
+ LONG $0xd0d40f66 // paddq xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xf80c // movdqu oword [r8 + 8*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xf854; BYTE $0x10 // movdqu oword [r8 + 8*rdi + 16], xmm2
+
+LBB2_660:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_661
+
+LBB2_665:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_666:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_668
+ LONG $0xb914100f // movups xmm2, oword [rcx + 4*rdi]
+ LONG $0xb95c100f; BYTE $0x10 // movups xmm3, oword [rcx + 4*rdi + 16]
+ WORD $0x580f; BYTE $0xd1 // addps xmm2, xmm1
+ WORD $0x580f; BYTE $0xd9 // addps xmm3, xmm1
+ LONG $0x14110f41; BYTE $0xb8 // movups oword [r8 + 4*rdi], xmm2
+ LONG $0x5c110f41; WORD $0x10b8 // movups oword [r8 + 4*rdi + 16], xmm3
+
+LBB2_668:
+ WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
+ JE LBB2_737
+ JMP LBB2_669
+
+LBB2_673:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_674:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_676
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB2_676:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_677
+
+LBB2_681:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_682:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_684
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9f80f66 // psubb xmm3, xmm1
+ LONG $0xc2f80f66 // psubb xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0x381c // movdqu oword [r8 + rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0x3844; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm0
+
+LBB2_684:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_685
+
+LBB2_689:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_690:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_692
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB2_692:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_693
+
+LBB2_697:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_698:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_700
+ LONG $0x0c6f0ff3; BYTE $0x39 // movdqu xmm1, oword [rcx + rdi]
+ LONG $0x546f0ff3; WORD $0x1039 // movdqu xmm2, oword [rcx + rdi + 16]
+ LONG $0xc8fc0f66 // paddb xmm1, xmm0
+ LONG $0xd0fc0f66 // paddb xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0x380c // movdqu oword [r8 + rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0x3854; BYTE $0x10 // movdqu oword [r8 + rdi + 16], xmm2
+
+LBB2_700:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_701
+
+LBB2_705:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_706:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_708
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB2_708:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_709
+
+LBB2_713:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_714:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_716
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xd86f0f66 // movdqa xmm3, xmm0
+ LONG $0xd9fa0f66 // psubd xmm3, xmm1
+ LONG $0xc2fa0f66 // psubd xmm0, xmm2
+ LONG $0x7f0f41f3; WORD $0xb81c // movdqu oword [r8 + 4*rdi], xmm3
+ LONG $0x7f0f41f3; WORD $0xb844; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm0
+
+LBB2_716:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_717
+
+LBB2_721:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_722:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_724
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB2_724:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JE LBB2_737
+ JMP LBB2_725
+
+LBB2_729:
+ WORD $0xff31 // xor edi, edi
+
+LBB2_730:
+ LONG $0x01c1f641 // test r9b, 1
+ JE LBB2_732
+ LONG $0x0c6f0ff3; BYTE $0xb9 // movdqu xmm1, oword [rcx + 4*rdi]
+ LONG $0x546f0ff3; WORD $0x10b9 // movdqu xmm2, oword [rcx + 4*rdi + 16]
+ LONG $0xc8fe0f66 // paddd xmm1, xmm0
+ LONG $0xd0fe0f66 // paddd xmm2, xmm0
+ LONG $0x7f0f41f3; WORD $0xb80c // movdqu oword [r8 + 4*rdi], xmm1
+ LONG $0x7f0f41f3; WORD $0xb854; BYTE $0x10 // movdqu oword [r8 + 4*rdi + 16], xmm2
+
+LBB2_732:
+ WORD $0x394c; BYTE $0xd6 // cmp rsi, r10
+ JNE LBB2_733
+
+LBB2_737:
+ RET
diff --git a/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go b/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go
new file mode 100644
index 00000000000..a98781ffd7d
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/basic_arithmetic_noasm.go
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build noasm
+
+package kernels
+
+import (
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "golang.org/x/exp/constraints"
+)
+
+// getArithmeticBinaryOpFloating returns the kernel exec used for the binary
+// arithmetic operation op on floating point type T. This variant simply
+// hands back the pure Go implementation.
+func getArithmeticBinaryOpFloating[T constraints.Float](op ArithmeticOp) exec.ArrayKernelExec {
+	goImpl := getGoArithmeticBinaryOpFloating[T](op)
+	return goImpl
+}
+
+// getArithmeticBinaryOpIntegral returns the kernel exec used for the binary
+// arithmetic operation op on signed or unsigned integer type T. This variant
+// simply hands back the pure Go implementation.
+func getArithmeticBinaryOpIntegral[T exec.UintTypes | exec.IntTypes](op ArithmeticOp) exec.ArrayKernelExec {
+	goImpl := getGoArithmeticBinaryOpIntegral[T](op)
+	return goImpl
+}
diff --git a/go/arrow/compute/internal/kernels/helpers.go b/go/arrow/compute/internal/kernels/helpers.go
index 1da86803be8..7fd85414376 100644
--- a/go/arrow/compute/internal/kernels/helpers.go
+++ b/go/arrow/compute/internal/kernels/helpers.go
@@ -23,7 +23,9 @@ import (
"github.com/apache/arrow/go/v10/arrow"
"github.com/apache/arrow/go/v10/arrow/bitutil"
"github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/internal/debug"
"github.com/apache/arrow/go/v10/arrow/memory"
+ "github.com/apache/arrow/go/v10/arrow/scalar"
"github.com/apache/arrow/go/v10/internal/bitutils"
"golang.org/x/exp/constraints"
)
@@ -159,6 +161,170 @@ func ScalarUnaryBoolArg[OutT exec.FixedWidthTypes](op func(*exec.KernelCtx, []by
}
}
+// UnboxScalar reinterprets the start of the scalar's raw data buffer as a
+// value of type T and returns a copy of it.
+//
+// The first byte of val.Data() is indexed unconditionally, so an empty
+// buffer panics. No check is made here that the buffer holds at least
+// sizeof(T) bytes.
+func UnboxScalar[T exec.FixedWidthTypes](val scalar.PrimitiveScalar) T {
+	raw := val.Data()
+	typed := (*T)(unsafe.Pointer(&raw[0]))
+	return *typed
+}
+
+// UnboxBinaryScalar returns the raw bytes held by a binary scalar, or nil
+// when the scalar is not valid.
+func UnboxBinaryScalar(val scalar.BinaryScalar) []byte {
+	if val.IsValid() {
+		return val.Data()
+	}
+	return nil
+}
+
+// arrArrFn is the signature of a binary operation applied to two slices of
+// values, writing its results into the final (output) slice.
+type arrArrFn[OutT, Arg0T, Arg1T exec.FixedWidthTypes] func(*exec.KernelCtx, []Arg0T, []Arg1T, []OutT) error
+
+// arrScalarFn is the signature of a binary operation whose left operand is a
+// slice of values and whose right operand is a single value.
+type arrScalarFn[OutT, Arg0T, Arg1T exec.FixedWidthTypes] func(*exec.KernelCtx, []Arg0T, Arg1T, []OutT) error
+
+// scalarArrFn is the signature of a binary operation whose left operand is a
+// single value and whose right operand is a slice of values.
+type scalarArrFn[OutT, Arg0T, Arg1T exec.FixedWidthTypes] func(*exec.KernelCtx, Arg0T, []Arg1T, []OutT) error
+
+// binaryOps bundles the three implementations of one binary operation, one
+// per supported combination of array and scalar operands.
+type binaryOps[OutT, Arg0T, Arg1T exec.FixedWidthTypes] struct {
+ // arrArr handles array (left) with array (right).
+ arrArr arrArrFn[OutT, Arg0T, Arg1T]
+ // arrScalar handles array (left) with scalar (right).
+ arrScalar arrScalarFn[OutT, Arg0T, Arg1T]
+ // scalarArr handles scalar (left) with array (right).
+ scalarArr scalarArrFn[OutT, Arg0T, Arg1T]
+}
+
+func ScalarBinary[OutT, Arg0T, Arg1T exec.FixedWidthTypes](ops binaryOps[OutT, Arg0T, Arg1T]) exec.ArrayKernelExec {
+ arrayArray := func(ctx *exec.KernelCtx, arg0, arg1 *exec.ArraySpan, out *exec.ExecResult) error {
+ var (
+ a0 = exec.GetSpanValues[Arg0T](arg0, 1)
+ a1 = exec.GetSpanValues[Arg1T](arg1, 1)
+ outData = exec.GetSpanValues[OutT](out, 1)
+ )
+ return ops.arrArr(ctx, a0, a1, outData)
+ }
+
+ arrayScalar := func(ctx *exec.KernelCtx, arg0 *exec.ArraySpan, arg1 scalar.Scalar, out *exec.ExecResult) error {
+ var (
+ a0 = exec.GetSpanValues[Arg0T](arg0, 1)
+ a1 = UnboxScalar[Arg1T](arg1.(scalar.PrimitiveScalar))
+ outData = exec.GetSpanValues[OutT](out, 1)
+ )
+ return ops.arrScalar(ctx, a0, a1, outData)
+ }
+
+ scalarArray := func(ctx *exec.KernelCtx, arg0 scalar.Scalar, arg1 *exec.ArraySpan, out *exec.ExecResult) error {
+ var (
+ a0 = UnboxScalar[Arg0T](arg0.(scalar.PrimitiveScalar))
+ a1 = exec.GetSpanValues[Arg1T](arg1, 1)
+ outData = exec.GetSpanValues[OutT](out, 1)
+ )
+ return ops.scalarArr(ctx, a0, a1, outData)
+ }
+
+ return func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
+ if batch.Values[0].IsArray() {
+ if batch.Values[1].IsArray() {
+ return arrayArray(ctx, &batch.Values[0].Array, &batch.Values[1].Array, out)
+ }
+ return arrayScalar(ctx, &batch.Values[0].Array, batch.Values[1].Scalar, out)
+ }
+
+ if batch.Values[1].IsArray() {
+ return scalarArray(ctx, batch.Values[0].Scalar, &batch.Values[1].Array, out)
+ }
+
+ debug.Assert(false, "should be unreachable")
+ return fmt.Errorf("%w: scalar binary with two scalars?", arrow.ErrInvalid)
+ }
+}
+
+// ScalarBinaryNotNull builds a kernel exec function for a two-argument
+// operation that is evaluated element by element, calling op only for the
+// positions the validity bitmaps report as valid. Every other position
+// visited is filled with the zero value of OutT.
+//
+// op receives the kernel context, the two operand values and a pointer to
+// the error that the exec function will return. The visit is not interrupted
+// when op stores an error through that pointer; the remaining positions are
+// still processed and whatever op last left in the error is returned.
+//
+// If either input is entirely null (an array whose null count equals its
+// length, or an invalid scalar) the exec function returns nil immediately
+// without writing to the output values.
+//
+// A batch in which neither input is an array trips a debug assertion and
+// yields an error wrapping arrow.ErrInvalid.
+func ScalarBinaryNotNull[OutT, Arg0T, Arg1T exec.FixedWidthTypes](op func(*exec.KernelCtx, Arg0T, Arg1T, *error) OutT) exec.ArrayKernelExec {
+	arrayArray := func(ctx *exec.KernelCtx, arg0, arg1 *exec.ArraySpan, out *exec.ExecResult) (err error) {
+		// fast path if one side is entirely null
+		if arg0.UpdateNullCount() == arg0.Len || arg1.UpdateNullCount() == arg1.Len {
+			return nil
+		}
+
+		var (
+			a0      = exec.GetSpanValues[Arg0T](arg0, 1)
+			a1      = exec.GetSpanValues[Arg1T](arg1, 1)
+			outData = exec.GetSpanValues[OutT](out, 1)
+			outPos  int64
+			def     OutT
+		)
+		// the first callback runs for positions valid in both bitmaps,
+		// the second for every other position
+		bitutils.VisitTwoBitBlocks(arg0.Buffers[0].Buf, arg1.Buffers[0].Buf, arg0.Offset, arg1.Offset, out.Len,
+			func(pos int64) {
+				outData[outPos] = op(ctx, a0[pos], a1[pos], &err)
+				outPos++
+			}, func() {
+				outData[outPos] = def
+				outPos++
+			})
+		return
+	}
+
+	arrayScalar := func(ctx *exec.KernelCtx, arg0 *exec.ArraySpan, arg1 scalar.Scalar, out *exec.ExecResult) (err error) {
+		// fast path if one side is entirely null; this is the only place
+		// the scalar's validity needs to be checked
+		if arg0.UpdateNullCount() == arg0.Len || !arg1.IsValid() {
+			return nil
+		}
+
+		var (
+			a0      = exec.GetSpanValues[Arg0T](arg0, 1)
+			outData = exec.GetSpanValues[OutT](out, 1)
+			outPos  int64
+			def     OutT
+		)
+
+		a1 := UnboxScalar[Arg1T](arg1.(scalar.PrimitiveScalar))
+		bitutils.VisitBitBlocks(arg0.Buffers[0].Buf, arg0.Offset, arg0.Len,
+			func(pos int64) {
+				outData[outPos] = op(ctx, a0[pos], a1, &err)
+				outPos++
+			}, func() {
+				outData[outPos] = def
+				outPos++
+			})
+		return
+	}
+
+	scalarArray := func(ctx *exec.KernelCtx, arg0 scalar.Scalar, arg1 *exec.ArraySpan, out *exec.ExecResult) (err error) {
+		// fast path if one side is entirely null; this is the only place
+		// the scalar's validity needs to be checked
+		if arg1.UpdateNullCount() == arg1.Len || !arg0.IsValid() {
+			return nil
+		}
+
+		var (
+			a1      = exec.GetSpanValues[Arg1T](arg1, 1)
+			outData = exec.GetSpanValues[OutT](out, 1)
+			outPos  int64
+			def     OutT
+		)
+
+		a0 := UnboxScalar[Arg0T](arg0.(scalar.PrimitiveScalar))
+		bitutils.VisitBitBlocks(arg1.Buffers[0].Buf, arg1.Offset, arg1.Len,
+			func(pos int64) {
+				outData[outPos] = op(ctx, a0, a1[pos], &err)
+				outPos++
+			}, func() {
+				outData[outPos] = def
+				outPos++
+			})
+		return
+	}
+
+	return func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
+		if batch.Values[0].IsArray() {
+			if batch.Values[1].IsArray() {
+				return arrayArray(ctx, &batch.Values[0].Array, &batch.Values[1].Array, out)
+			}
+			return arrayScalar(ctx, &batch.Values[0].Array, batch.Values[1].Scalar, out)
+		}
+
+		if batch.Values[1].IsArray() {
+			return scalarArray(ctx, batch.Values[0].Scalar, &batch.Values[1].Array, out)
+		}
+
+		debug.Assert(false, "should be unreachable")
+		return fmt.Errorf("%w: scalar binary with two scalars?", arrow.ErrInvalid)
+	}
+}
+
// SizeOf determines the size in number of bytes for an integer
// based on the generic value in a way that the compiler should
// be able to easily evaluate and create as a constant.
diff --git a/go/arrow/compute/internal/kernels/scalar_arithmetic.go b/go/arrow/compute/internal/kernels/scalar_arithmetic.go
new file mode 100644
index 00000000000..cbe92f199eb
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/scalar_arithmetic.go
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernels
+
+import (
+ "github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+)
+
+// NullToNullExec is a scalar kernel exec function that ignores all of its
+// arguments (the inputs are assumed to be all-null) and returns nil without
+// touching the output.
+func NullToNullExec(_ *exec.KernelCtx, _ *exec.ExecSpan, _ *exec.ExecResult) error {
+ return nil
+}
+
+// NullExecKernel creates a scalar kernel that accepts nargs inputs, each
+// matched by the arrow.NULL type id, has output type arrow.Null and uses
+// NullToNullExec as its exec function.
+func NullExecKernel(nargs int) exec.ScalarKernel {
+	inputTypes := make([]exec.InputType, nargs)
+	for idx := 0; idx < nargs; idx++ {
+		inputTypes[idx] = exec.NewIDInput(arrow.NULL)
+	}
+
+	outputType := exec.NewOutputType(arrow.Null)
+	return exec.NewScalarKernel(inputTypes, outputType, NullToNullExec, nil)
+}
+
+// GetArithmeticKernels returns the scalar kernels implementing the binary
+// arithmetic operation op: one kernel per entry of numericTypes, taking two
+// inputs of exactly that type and producing the same type, followed by a
+// two-argument all-null kernel from NullExecKernel.
+func GetArithmeticKernels(op ArithmeticOp) []exec.ScalarKernel {
+	// the final size is known up front: one kernel per numeric type plus
+	// the trailing null kernel, so allocate once instead of growing
+	kernels := make([]exec.ScalarKernel, 0, len(numericTypes)+1)
+	for _, ty := range numericTypes {
+		kernels = append(kernels, exec.NewScalarKernel(
+			[]exec.InputType{exec.NewExactInput(ty), exec.NewExactInput(ty)},
+			exec.NewOutputType(ty), ArithmeticExec(ty.ID(), op), nil))
+	}
+	return append(kernels, NullExecKernel(2))
+}
diff --git a/go/arrow/compute/internal/kernels/scalar_boolean.go b/go/arrow/compute/internal/kernels/scalar_boolean.go
new file mode 100644
index 00000000000..a458306451b
--- /dev/null
+++ b/go/arrow/compute/internal/kernels/scalar_boolean.go
@@ -0,0 +1,332 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernels
+
+import (
+ "github.com/apache/arrow/go/v10/arrow/bitutil"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/scalar"
+)
+
+// computeWordFN combines one 64-bit word of "left is true", "left is false",
+// "right is true" and "right is false" masks into an output validity word
+// and an output data word.
+type computeWordFN func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64)
+
+// computeKleene applies computeWord across left and right one word at a time
+// via bitutil.VisitWordsAndWrite, writing both buffers of out.
+//
+// Buffer 0 of each span is read as the validity bitmap and buffer 1 as the
+// data bitmap. For every word, the validity/data pair of each side is turned
+// into "true" (valid AND data) and "false" (valid AND NOT data) masks before
+// being handed to computeWord. ctx is not used.
+func computeKleene(computeWord computeWordFN, ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+ var (
+ // Order is: left validity, left data, right data, right validity.
+ // Right data deliberately precedes right validity so that inBMs[:3]
+ // drops right validity and inBMs[1:] drops left validity.
+ inBMs = [4]bitutil.Bitmap{
+ {Data: left.Buffers[0].Buf, Offset: left.Offset, Len: left.Len},
+ {Data: left.Buffers[1].Buf, Offset: left.Offset, Len: left.Len},
+ {Data: right.Buffers[1].Buf, Offset: right.Offset, Len: right.Len},
+ {Data: right.Buffers[0].Buf, Offset: right.Offset, Len: right.Len},
+ }
+ // Order is: output validity, output data.
+ outBMs = [2]bitutil.Bitmap{
+ {Data: out.Buffers[0].Buf, Offset: out.Offset, Len: out.Len},
+ {Data: out.Buffers[1].Buf, Offset: out.Offset, Len: out.Len},
+ }
+ // apply derives the true/false masks for both sides and delegates to
+ // computeWord.
+ apply = func(leftValid, leftData uint64, rightValid, rightData uint64) (outValidity, outData uint64) {
+ leftTrue, leftFalse := leftValid&leftData, leftValid&^leftData
+ rightTrue, rightFalse := rightValid&rightData, rightValid&^rightData
+ return computeWord(leftTrue, leftFalse, rightTrue, rightFalse)
+ }
+ )
+
+ switch {
+ case right.UpdateNullCount() == 0:
+ // right has no nulls: visit (left validity, left data, right data) and
+ // substitute an all-ones word for right validity.
+ return bitutil.VisitWordsAndWrite(inBMs[:3], outBMs[:],
+ func(in, out []uint64) {
+ out[0], out[1] = apply(in[0], in[1], ^uint64(0), in[2])
+ })
+ case left.UpdateNullCount() == 0:
+ // left has no nulls: visit (left data, right data, right validity) and
+ // substitute an all-ones word for left validity.
+ return bitutil.VisitWordsAndWrite(inBMs[1:], outBMs[:],
+ func(in, out []uint64) {
+ out[0], out[1] = apply(^uint64(0), in[0], in[2], in[1])
+ })
+ default:
+ // both sides may contain nulls: visit all four bitmaps; note that
+ // in[3] is right validity and in[2] is right data.
+ return bitutil.VisitWordsAndWrite(inBMs[:], outBMs[:],
+ func(in, out []uint64) {
+ out[0], out[1] = apply(in[0], in[1], in[3], in[2])
+ })
+ }
+}
+
+// AndOpKernel implements the plain (non-Kleene) boolean "and". It only
+// writes the output data bitmap (buffer 1). CallScalarRight comes from the
+// embedded commutativeBinaryKernel.
+type AndOpKernel struct {
+	commutativeBinaryKernel[AndOpKernel]
+}
+
+// Call ANDs the data bitmaps of left and right into the data bitmap of out,
+// processing left.Len bits.
+func (AndOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		lhsBits = left.Buffers[1].Buf
+		rhsBits = right.Buffers[1].Buf
+		dstBits = out.Buffers[1].Buf
+	)
+	bitutil.BitmapAnd(lhsBits, rhsBits, left.Offset, right.Offset, dstBits, out.Offset, left.Len)
+	return nil
+}
+
+// CallScalarLeft handles "scalar AND array". An invalid scalar writes
+// nothing; a true scalar copies right's data bits into out; a false scalar
+// clears out's data bits.
+func (AndOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	dstBits := out.Buffers[1].Buf
+	if !left.(*scalar.Boolean).Value {
+		// false AND x is always false
+		bitutil.SetBitsTo(dstBits, out.Offset, out.Len, false)
+		return nil
+	}
+
+	// true AND x is x
+	bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset),
+		int(right.Len), dstBits, int(out.Offset))
+	return nil
+}
+
+// KleeneAndOpKernel implements boolean "and" with Kleene null semantics: it
+// writes both the output validity bitmap (buffer 0) and the output data
+// bitmap (buffer 1). CallScalarRight comes from the embedded
+// commutativeBinaryKernel.
+type KleeneAndOpKernel struct {
+ commutativeBinaryKernel[KleeneAndOpKernel]
+}
+
+// Call computes the Kleene "and" of two arrays.
+func (KleeneAndOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+ // fast path: with no nulls on either side every output slot is valid and
+ // the data is the plain AND of the inputs.
+ if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 {
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ return (AndOpKernel{}).Call(ctx, left, right, out)
+ }
+
+ // output is valid when either side is a definite false or both sides are
+ // true; output data is true only when both sides are true.
+ computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
+ return leftFalse | rightFalse | (leftTrue & rightTrue), leftTrue & rightTrue
+ }
+ return computeKleene(computeWord, ctx, left, right, out)
+}
+
+// CallScalarLeft computes the Kleene "and" of a scalar and an array,
+// branching on whether the scalar is false, true or null.
+func (KleeneAndOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+ var (
+ leftTrue = left.IsValid() && left.(*scalar.Boolean).Value
+ leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value
+ )
+
+ switch {
+ case leftFalse:
+ // false AND anything is false: all slots valid, all data bits cleared
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, false)
+ case leftTrue:
+ // true AND x is x: output validity and data both mirror right
+ if right.UpdateNullCount() == 0 {
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ } else {
+ bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[0].Buf, int(out.Offset))
+ }
+ bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[1].Buf, int(out.Offset))
+ default: // scalar was null: out[i] is valid iff right[i] was false
+ if right.UpdateNullCount() == 0 {
+ // no nulls in right: "right is false" is just NOT right data
+ bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[0].Buf, int(out.Offset))
+ } else {
+ // "right is false" is right validity AND NOT right data
+ bitutil.BitmapAndNot(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset,
+ right.Offset, out.Buffers[0].Buf, out.Offset, right.Len)
+ }
+ // the slots left valid above are exactly those where right is false,
+ // so copying right's data yields false in every valid slot
+ bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[1].Buf, int(out.Offset))
+ }
+ return nil
+}
+
+// OrOpKernel implements the plain (non-Kleene) boolean "or". It only writes
+// the output data bitmap (buffer 1). CallScalarRight comes from the embedded
+// commutativeBinaryKernel.
+type OrOpKernel struct {
+	commutativeBinaryKernel[OrOpKernel]
+}
+
+// Call ORs the data bitmaps of left and right into the data bitmap of out,
+// processing left.Len bits.
+func (OrOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		lhsBits = left.Buffers[1].Buf
+		rhsBits = right.Buffers[1].Buf
+		dstBits = out.Buffers[1].Buf
+	)
+	bitutil.BitmapOr(lhsBits, rhsBits, left.Offset, right.Offset, dstBits, out.Offset, left.Len)
+	return nil
+}
+
+// CallScalarLeft handles "scalar OR array". An invalid scalar writes nothing;
+// a true scalar sets every output data bit; a false scalar copies right's
+// data bits into out.
+func (OrOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	dstBits := out.Buffers[1].Buf
+	if !left.(*scalar.Boolean).Value {
+		// false OR x is x
+		bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset),
+			int(right.Len), dstBits, int(out.Offset))
+		return nil
+	}
+
+	// true OR x is always true
+	bitutil.SetBitsTo(dstBits, out.Offset, out.Len, true)
+	return nil
+}
+
+// KleeneOrOpKernel implements boolean "or" with Kleene null semantics: it
+// writes both the output validity bitmap (buffer 0) and the output data
+// bitmap (buffer 1). CallScalarRight comes from the embedded
+// commutativeBinaryKernel.
+type KleeneOrOpKernel struct {
+ commutativeBinaryKernel[KleeneOrOpKernel]
+}
+
+// Call computes the Kleene "or" of two arrays.
+func (KleeneOrOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+ // fast path: with no nulls on either side every output slot is valid and
+ // the data is the plain OR of the inputs.
+ if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 {
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ return (OrOpKernel{}).Call(ctx, left, right, out)
+ }
+
+ // output is valid when either side is a definite true or both sides are
+ // false; output data is true when either side is true.
+ computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
+ return leftTrue | rightTrue | (leftFalse & rightFalse), leftTrue | rightTrue
+ }
+ return computeKleene(computeWord, ctx, left, right, out)
+}
+
+// CallScalarLeft computes the Kleene "or" of a scalar and an array,
+// branching on whether the scalar is true, false or null.
+func (KleeneOrOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+ var (
+ leftTrue = left.IsValid() && left.(*scalar.Boolean).Value
+ leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value
+ )
+
+ switch {
+ case leftTrue:
+ // true OR anything is true: all slots valid, all data bits set
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, true) // all true case
+ case leftFalse:
+ // false OR x is x: output validity and data both mirror right
+ if right.UpdateNullCount() == 0 {
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ } else {
+ bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[0].Buf, int(out.Offset))
+ }
+ bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[1].Buf, int(out.Offset))
+ default: // scalar was null: out[i] is valid iff right[i] was true
+ if right.UpdateNullCount() == 0 {
+ // no nulls in right: "right is true" is just right data
+ bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[0].Buf, int(out.Offset))
+ } else {
+ // "right is true" is right validity AND right data
+ bitutil.BitmapAnd(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset,
+ right.Offset, out.Buffers[0].Buf, out.Offset, right.Len)
+ }
+ // the slots left valid above are exactly those where right is true,
+ // so copying right's data yields true in every valid slot
+ bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[1].Buf, int(out.Offset))
+ }
+ return nil
+}
+
+// XorOpKernel implements boolean "xor". It only writes the output data
+// bitmap (buffer 1). CallScalarRight comes from the embedded
+// commutativeBinaryKernel.
+type XorOpKernel struct {
+	commutativeBinaryKernel[XorOpKernel]
+}
+
+// Call XORs the data bitmaps of left and right into the data bitmap of out,
+// processing out.Len bits.
+func (XorOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		lhsBits = left.Buffers[1].Buf
+		rhsBits = right.Buffers[1].Buf
+		dstBits = out.Buffers[1].Buf
+	)
+	bitutil.BitmapXor(lhsBits, rhsBits, left.Offset, right.Offset, dstBits, out.Offset, out.Len)
+	return nil
+}
+
+// CallScalarLeft handles "scalar XOR array". An invalid scalar writes
+// nothing; a true scalar writes the inverse of right's data bits; a false
+// scalar copies right's data bits unchanged.
+func (XorOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	var (
+		srcBits = right.Buffers[1].Buf
+		dstBits = out.Buffers[1].Buf
+	)
+	if !left.(*scalar.Boolean).Value {
+		// false XOR x is x
+		bitutil.CopyBitmap(srcBits, int(right.Offset), int(right.Len),
+			dstBits, int(out.Offset))
+		return nil
+	}
+
+	// true XOR x is NOT x
+	bitutil.InvertBitmap(srcBits, int(right.Offset), int(right.Len),
+		dstBits, int(out.Offset))
+	return nil
+}
+
+// invertScalar returns a new boolean scalar holding the negation of in when
+// in is valid; an invalid (null) scalar is returned as-is. in must be a
+// *scalar.Boolean.
+func invertScalar(in scalar.Scalar) *scalar.Boolean {
+	valid := in.IsValid()
+	boolean := in.(*scalar.Boolean)
+	if !valid {
+		return boolean
+	}
+	return scalar.NewBooleanScalar(!boolean.Value)
+}
+
+// AndNotOpKernel implements the plain (non-Kleene) "left AND NOT right". It
+// only writes the output data bitmap (buffer 1). The operation is not
+// commutative, so both scalar variants are implemented explicitly.
+type AndNotOpKernel struct{}
+
+// Call writes left AND NOT right into the data bitmap of out, processing
+// right.Len bits.
+func (AndNotOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+	var (
+		lhsBits = left.Buffers[1].Buf
+		rhsBits = right.Buffers[1].Buf
+		dstBits = out.Buffers[1].Buf
+	)
+	bitutil.BitmapAndNot(lhsBits, rhsBits, left.Offset, right.Offset,
+		dstBits, out.Offset, right.Len)
+	return nil
+}
+
+// CallScalarLeft handles "scalar AND NOT array". An invalid scalar writes
+// nothing; a true scalar writes the inverse of right's data bits; a false
+// scalar clears out's data bits.
+func (AndNotOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+	if !left.IsValid() {
+		return nil
+	}
+
+	dstBits := out.Buffers[1].Buf
+	if !left.(*scalar.Boolean).Value {
+		// false AND NOT x is always false
+		bitutil.SetBitsTo(dstBits, out.Offset, out.Len, false)
+		return nil
+	}
+
+	// true AND NOT x is NOT x
+	bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+		dstBits, int(out.Offset))
+	return nil
+}
+
+// CallScalarRight handles "array AND NOT scalar" by inverting the scalar and
+// delegating to AndOpKernel.
+func (AndNotOpKernel) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error {
+	negated := invertScalar(right)
+	var and AndOpKernel
+	return and.CallScalarRight(ctx, left, negated, out)
+}
+
+// KleeneAndNotOpKernel implements "left AND NOT right" with Kleene null
+// semantics: it writes both the output validity bitmap (buffer 0) and the
+// output data bitmap (buffer 1).
+type KleeneAndNotOpKernel struct{}
+
+// Call computes the Kleene "and not" of two arrays.
+func (KleeneAndNotOpKernel) Call(ctx *exec.KernelCtx, left, right *exec.ArraySpan, out *exec.ExecResult) error {
+ // fast path: with no nulls on either side every output slot is valid and
+ // the data is the plain AND NOT of the inputs.
+ if left.UpdateNullCount() == 0 && right.UpdateNullCount() == 0 {
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ return (AndNotOpKernel{}).Call(ctx, left, right, out)
+ }
+
+ // output is valid when left is a definite false, right is a definite
+ // true, or left is true and right is false; output data is true only in
+ // that last case.
+ computeWord := func(leftTrue, leftFalse, rightTrue, rightFalse uint64) (outValid, outData uint64) {
+ return leftFalse | rightTrue | (leftTrue & rightFalse), leftTrue & rightFalse
+ }
+
+ return computeKleene(computeWord, ctx, left, right, out)
+}
+
+// CallScalarLeft computes the Kleene "scalar AND NOT array", branching on
+// whether the scalar is false, true or null.
+func (KleeneAndNotOpKernel) CallScalarLeft(ctx *exec.KernelCtx, left scalar.Scalar, right *exec.ArraySpan, out *exec.ExecResult) error {
+ var (
+ leftTrue = left.IsValid() && left.(*scalar.Boolean).Value
+ leftFalse = left.IsValid() && !left.(*scalar.Boolean).Value
+ )
+
+ switch {
+ case leftFalse:
+ // false AND NOT anything is false: all slots valid, data cleared
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ bitutil.SetBitsTo(out.Buffers[1].Buf, out.Offset, out.Len, false)
+ case leftTrue:
+ // true AND NOT x is NOT x: validity mirrors right, data is inverted
+ if right.UpdateNullCount() == 0 {
+ bitutil.SetBitsTo(out.Buffers[0].Buf, out.Offset, out.Len, true)
+ out.Nulls = 0
+ } else {
+ bitutil.CopyBitmap(right.Buffers[0].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[0].Buf, int(out.Offset))
+ }
+ bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[1].Buf, int(out.Offset))
+ default: // scalar was null: out[i] is valid iff right[i] was true
+ if right.UpdateNullCount() == 0 {
+ // no nulls in right: "right is true" is just right data
+ bitutil.CopyBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[0].Buf, int(out.Offset))
+ } else {
+ // "right is true" is right validity AND right data
+ bitutil.BitmapAnd(right.Buffers[0].Buf, right.Buffers[1].Buf, right.Offset, right.Offset,
+ out.Buffers[0].Buf, out.Offset, right.Len)
+ }
+ // the slots left valid above are exactly those where right is true,
+ // so inverting right's data yields false in every valid slot
+ bitutil.InvertBitmap(right.Buffers[1].Buf, int(right.Offset), int(right.Len),
+ out.Buffers[1].Buf, int(out.Offset))
+ }
+ return nil
+}
+
+// CallScalarRight computes the Kleene "array AND NOT scalar" by inverting
+// the scalar and delegating to KleeneAndOpKernel.
+func (KleeneAndNotOpKernel) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error {
+ return (KleeneAndOpKernel{}).CallScalarRight(ctx, left, invertScalar(right), out)
+}
diff --git a/go/arrow/compute/internal/kernels/types.go b/go/arrow/compute/internal/kernels/types.go
index bffb27e8801..073e1c608c8 100644
--- a/go/arrow/compute/internal/kernels/types.go
+++ b/go/arrow/compute/internal/kernels/types.go
@@ -17,7 +17,12 @@
package kernels
import (
+ "fmt"
+
"github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/internal/debug"
+ "github.com/apache/arrow/go/v10/arrow/scalar"
)
var (
@@ -35,7 +40,6 @@ var (
}
intTypes = append(unsignedIntTypes, signedIntTypes...)
floatingTypes = []arrow.DataType{
- arrow.FixedWidthTypes.Float16,
arrow.PrimitiveTypes.Float32,
arrow.PrimitiveTypes.Float64,
}
@@ -63,3 +67,41 @@ const (
CmpLT
CmpLE
)
+
+// simpleBinaryKernel is the minimal method set of a binary kernel: an
+// array-array implementation (Call) and a scalar-array implementation
+// (CallScalarLeft).
+type simpleBinaryKernel interface {
+ Call(*exec.KernelCtx, *exec.ArraySpan, *exec.ArraySpan, *exec.ExecResult) error
+ CallScalarLeft(*exec.KernelCtx, scalar.Scalar, *exec.ArraySpan, *exec.ExecResult) error
+}
+
+// commutativeBinaryKernel is a zero-size helper meant to be embedded in a
+// kernel type T whose operation is commutative; it supplies CallScalarRight
+// in terms of T's CallScalarLeft.
+type commutativeBinaryKernel[T simpleBinaryKernel] struct{}
+
+// CallScalarRight evaluates "array op scalar" by swapping the operands and
+// calling CallScalarLeft on a zero value of T.
+func (commutativeBinaryKernel[T]) CallScalarRight(ctx *exec.KernelCtx, left *exec.ArraySpan, right scalar.Scalar, out *exec.ExecResult) error {
+ var t T
+ return t.CallScalarLeft(ctx, right, left, out)
+}
+
+// SimpleBinaryKernel is the full method set required by SimpleBinary:
+// array-array (Call), scalar-array (CallScalarLeft) and array-scalar
+// (CallScalarRight) implementations.
+type SimpleBinaryKernel interface {
+ simpleBinaryKernel
+ CallScalarRight(*exec.KernelCtx, *exec.ArraySpan, scalar.Scalar, *exec.ExecResult) error
+}
+
+// SimpleBinary dispatches a two-argument batch to the matching method of a
+// zero-value K, depending on which of the two arguments are arrays.
+//
+// An empty batch returns nil without invoking the kernel. A batch in which
+// neither argument is an array trips a debug assertion and yields an error
+// wrapping arrow.ErrInvalid.
+func SimpleBinary[K SimpleBinaryKernel](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
+	if batch.Len == 0 {
+		return nil
+	}
+
+	var kernel K
+	lhs, rhs := &batch.Values[0], &batch.Values[1]
+	switch {
+	case lhs.IsArray() && rhs.IsArray():
+		return kernel.Call(ctx, &lhs.Array, &rhs.Array, out)
+	case lhs.IsArray():
+		return kernel.CallScalarRight(ctx, &lhs.Array, rhs.Scalar, out)
+	case rhs.IsArray():
+		return kernel.CallScalarLeft(ctx, lhs.Scalar, &rhs.Array, out)
+	}
+
+	debug.Assert(false, "should be unreachable")
+	return fmt.Errorf("%w: should be unreachable", arrow.ErrInvalid)
+}
diff --git a/go/arrow/compute/registry.go b/go/arrow/compute/registry.go
index 7bb8d8c5995..c28eea619a8 100644
--- a/go/arrow/compute/registry.go
+++ b/go/arrow/compute/registry.go
@@ -46,6 +46,8 @@ func GetFunctionRegistry() FunctionRegistry {
registry = NewRegistry()
RegisterScalarCast(registry)
RegisterVectorSelection(registry)
+ RegisterScalarBoolean(registry)
+ RegisterScalarArithmetic(registry)
})
return registry
}
diff --git a/go/arrow/compute/scalar_bool.go b/go/arrow/compute/scalar_bool.go
new file mode 100644
index 00000000000..0a0f6afd191
--- /dev/null
+++ b/go/arrow/compute/scalar_bool.go
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compute
+
+import (
+ "fmt"
+
+ "github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/kernels"
+)
+
+// Documentation for the boolean scalar functions. The plain variants emit
+// null whenever either input is null; the "Kleene" variants treat null as
+// "unknown" and follow the truth tables spelled out in their descriptions.
+var (
+ andDoc = FunctionDoc{
+ Summary: "Logical 'and' boolean values",
+ Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'and_kleene'",
+ ArgNames: []string{"x", "y"},
+ }
+ andNotDoc = FunctionDoc{
+ Summary: "Logical 'and not' boolean values",
+ Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'and_not_kleene'",
+ ArgNames: []string{"x", "y"},
+ }
+ orDoc = FunctionDoc{
+ Summary: "Logical 'or' boolean values",
+ Description: "When a null is encountered in either input, a null is output.\nFor a different null behavior, see function 'or_kleene'",
+ ArgNames: []string{"x", "y"},
+ }
+ xorDoc = FunctionDoc{
+ Summary: "Logical 'xor' boolean values",
+ Description: "When a null is encountered in either input, a null is output.",
+ ArgNames: []string{"x", "y"},
+ }
+ andKleeneDoc = FunctionDoc{
+ Summary: "Logical 'and' boolean values (Kleene logic)",
+ Description: `This function behaves as follows with nulls:
+
+ - true and null = null
+ - null and true = null
+ - false and null = false
+ - null and false = false
+ - null and null = null
+
+ In other words, in this context, a null value really means "unknown"
+ and an unknown value "and" false is always false.
+ For a different null behavior, see function "and".`,
+ ArgNames: []string{"x", "y"},
+ }
+ andNotKleeneDoc = FunctionDoc{
+ Summary: "Logical 'and_not' boolean values (Kleene logic)",
+ Description: `This function behaves as follows with nulls:
+
+ - true and not null = null
+ - null and not false = null
+ - false and not null = false
+ - null and not true = false
+ - null and not null = null
+
+ In other words, in this context, a null value really means "unknown"
+ and an unknown value "and not" true is always false, as is false
+ "and not" an unknown value.
+ For a different null behavior, see function "and_not".`,
+ ArgNames: []string{"x", "y"},
+ }
+ // The non-Kleene counterpart of or_kleene is "or", so that is the
+ // function the description refers readers to.
+ orKleeneDoc = FunctionDoc{
+ Summary: "Logical 'or' boolean values (Kleene logic)",
+ Description: `This function behaves as follows with nulls:
+
+ - true or null = true
+ - null or true = true
+ - false or null = null
+ - null or false = null
+ - null or null = null
+
+ In other words, in this context, a null value really means "unknown"
+ and an unknown value "or" true is always true.
+ For a different null behavior, see function "or".`,
+ ArgNames: []string{"x", "y"},
+ }
+)
+
+// makeFunction creates a scalar function called name with a single kernel
+// that takes arity boolean inputs, produces a boolean output, executes ex
+// and uses the given null handling, then adds it to reg.
+//
+// It panics if the kernel cannot be added to the function or if reg already
+// contains a function with that name.
+func makeFunction(reg FunctionRegistry, name string, arity int, ex exec.ArrayKernelExec, doc FunctionDoc, nulls exec.NullHandling) {
+	scalarFn := NewScalarFunction(name, Arity{NArgs: arity}, doc)
+
+	boolInputs := make([]exec.InputType, 0, arity)
+	for len(boolInputs) < arity {
+		boolInputs = append(boolInputs, exec.NewExactInput(arrow.FixedWidthTypes.Boolean))
+	}
+
+	boolOutput := exec.NewOutputType(arrow.FixedWidthTypes.Boolean)
+	kernel := exec.NewScalarKernel(boolInputs, boolOutput, ex, nil)
+	kernel.NullHandling = nulls
+
+	err := scalarFn.AddKernel(kernel)
+	if err != nil {
+		panic(err)
+	}
+
+	// allowOverwrite is false, so a duplicate name is reported as a failure
+	if added := reg.AddFunction(scalarFn, false); !added {
+		panic(fmt.Errorf("function '%s' already exists", name))
+	}
+}
+
+// RegisterScalarBoolean adds the binary boolean functions to reg: "and",
+// "and_not", "or" and "xor" with intersection null handling, followed by
+// "and_kleene", "and_not_kleene" and "or_kleene", which compute their own
+// validity into a preallocated bitmap.
+func RegisterScalarBoolean(reg FunctionRegistry) {
+	registrations := []struct {
+		name  string
+		fn    exec.ArrayKernelExec
+		doc   FunctionDoc
+		nulls exec.NullHandling
+	}{
+		{"and", kernels.SimpleBinary[kernels.AndOpKernel], andDoc, exec.NullIntersection},
+		{"and_not", kernels.SimpleBinary[kernels.AndNotOpKernel], andNotDoc, exec.NullIntersection},
+		{"or", kernels.SimpleBinary[kernels.OrOpKernel], orDoc, exec.NullIntersection},
+		{"xor", kernels.SimpleBinary[kernels.XorOpKernel], xorDoc, exec.NullIntersection},
+		{"and_kleene", kernels.SimpleBinary[kernels.KleeneAndOpKernel], andKleeneDoc, exec.NullComputedPrealloc},
+		{"and_not_kleene", kernels.SimpleBinary[kernels.KleeneAndNotOpKernel], andNotKleeneDoc, exec.NullComputedPrealloc},
+		{"or_kleene", kernels.SimpleBinary[kernels.KleeneOrOpKernel], orKleeneDoc, exec.NullComputedPrealloc},
+	}
+
+	// every function registered here is binary
+	for _, r := range registrations {
+		makeFunction(reg, r.name, 2, r.fn, r.doc, r.nulls)
+	}
+}
diff --git a/go/arrow/compute/scalar_bool_test.go b/go/arrow/compute/scalar_bool_test.go
new file mode 100644
index 00000000000..956118d2653
--- /dev/null
+++ b/go/arrow/compute/scalar_bool_test.go
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compute_test
+
+import (
+ "context"
+ "strings"
+ "testing"
+
+ "github.com/apache/arrow/go/v10/arrow"
+ "github.com/apache/arrow/go/v10/arrow/array"
+ "github.com/apache/arrow/go/v10/arrow/compute"
+ "github.com/apache/arrow/go/v10/arrow/memory"
+ "github.com/apache/arrow/go/v10/arrow/scalar"
+ "github.com/stretchr/testify/require"
+)
+
+// checkScalarBinary is a convenience wrapper around checkScalar for
+// functions that take exactly two arguments.
+func checkScalarBinary(t *testing.T, fn string, left, right, expected compute.Datum, opts compute.FunctionOptions) {
+ checkScalar(t, fn, []compute.Datum{left, right}, expected, opts)
+}
+
+// checkBooleanScalarArrayBinary verifies that funcName gives the same answer
+// for a scalar operand as for an equivalent constant array.
+//
+// For each of a null, true and false boolean scalar it materialises a
+// constant array of the same length as array, computes the expected result
+// from the array-array call, and compares it against the scalar-array call;
+// it then repeats the check with the operands swapped.
+func checkBooleanScalarArrayBinary(t *testing.T, ctx context.Context, funcName string, array compute.Datum) {
+	mem := compute.GetAllocator(ctx)
+	scalars := []scalar.Scalar{
+		scalar.MakeNullScalar(arrow.FixedWidthTypes.Boolean),
+		scalar.NewBooleanScalar(true),
+		scalar.NewBooleanScalar(false),
+	}
+
+	for _, sc := range scalars {
+		// run each scalar in its own function so its deferred releases fire
+		// at the end of the iteration instead of accumulating
+		func() {
+			constantArr, err := scalar.MakeArrayFromScalar(sc, int(array.Len()), mem)
+			// check the error before registering the release: on failure
+			// constantArr is not usable and releasing it would panic
+			require.NoError(t, err)
+			defer constantArr.Release()
+
+			constant := &compute.ArrayDatum{Value: constantArr.Data()}
+
+			// scalar on the left
+			expected, err := compute.CallFunction(ctx, funcName, nil, constant, array)
+			require.NoError(t, err)
+			defer expected.Release()
+			checkScalar(t, funcName, []compute.Datum{compute.NewDatum(sc), array}, expected, nil)
+
+			// scalar on the right
+			expectedSwapped, err := compute.CallFunction(ctx, funcName, nil, array, constant)
+			require.NoError(t, err)
+			defer expectedSwapped.Release()
+			checkScalar(t, funcName, []compute.Datum{array, compute.NewDatum(sc)}, expectedSwapped, nil)
+		}()
+	}
+}
+
+// TestBooleanKernels exercises the non-Kleene boolean functions ("and",
+// "or", "xor", "and_not") on array-array inputs and, through
+// checkBooleanScalarArrayBinary, on scalar-array inputs. Commutative
+// functions use a six-element fixture; "and_not" uses the full nine-element
+// cross product of {true, false, null}.
+func TestBooleanKernels(t *testing.T) {
+	tests := []struct {
+		fn           string
+		expectedJSON string
+		commutative  bool
+	}{
+		{"and", `[true, false, null, false, null, null]`, true},
+		{"or", `[true, true, null, false, null, null]`, true},
+		{"xor", `[false, true, null, false, null, null]`, true},
+		{"and_not", `[false, true, null, false, false, null, null, null, null]`, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.fn, func(t *testing.T) {
+			mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+			defer mem.AssertSize(t, 0)
+
+			var (
+				leftJSON  = `[true, true, true, false, false, null]`
+				rightJSON = `[true, false, null, false, null, null]`
+			)
+
+			if !tt.commutative {
+				leftJSON = `[true, true, true, false, false, false, null, null, null]`
+				rightJSON = `[true, false, null, true, false, null, true, false, null]`
+			}
+
+			// fail with a clear message on a malformed fixture rather than
+			// panicking on a nil array further down
+			left, _, err := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+				strings.NewReader(leftJSON))
+			require.NoError(t, err)
+			defer left.Release()
+			right, _, err := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+				strings.NewReader(rightJSON))
+			require.NoError(t, err)
+			defer right.Release()
+			exp, _, err := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+				strings.NewReader(tt.expectedJSON))
+			require.NoError(t, err)
+			defer exp.Release()
+
+			checkScalarBinary(t, tt.fn, &compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()}, &compute.ArrayDatum{Value: exp.Data()}, nil)
+			ctx := compute.WithAllocator(context.Background(), mem)
+			checkBooleanScalarArrayBinary(t, ctx, tt.fn, &compute.ArrayDatum{Value: left.Data()})
+		})
+	}
+}
+
+// TestBooleanKleeneKernels exercises the Kleene boolean functions
+// ("and_kleene", "or_kleene", "and_not_kleene"). Each function is run
+// against several fixtures, one per entry of expectedJSON, covering inputs
+// with nulls on both sides, on one side only, and on neither side, followed
+// by the scalar-array checks of checkBooleanScalarArrayBinary.
+func TestBooleanKleeneKernels(t *testing.T) {
+	tests := []struct {
+		fn           string
+		expectedJSON []string
+		commutative  bool
+	}{
+		{"and_kleene", []string{`[true, false, null, false, false, null]`, `[true, false, false, null, false]`, `[true, false, false, false]`}, true},
+		{"or_kleene", []string{`[true, true, true, false, null, null]`, `[true, true, false, true, null]`, `[true, true, false, true]`}, true},
+		{"and_not_kleene", []string{`[false, true, null, false, false, false, false, null, null]`, `[false, true, false, false]`}, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.fn, func(t *testing.T) {
+			var (
+				leftJSON  = make([]string, len(tt.expectedJSON))
+				rightJSON = make([]string, len(tt.expectedJSON))
+			)
+
+			if tt.commutative {
+				leftJSON[0] = `[true, true, true, false, false, null]`
+				rightJSON[0] = `[true, false, null, false, null, null]`
+				leftJSON[1] = `[true, true, false, null, null]`
+				rightJSON[1] = `[true, false, false, true, false]`
+				leftJSON[2] = `[true, true, false, true]`
+				rightJSON[2] = `[true, false, false, false]`
+			} else {
+				leftJSON[0] = `[true, true, true, false, false, false, null, null, null]`
+				rightJSON[0] = `[true, false, null, true, false, null, true, false, null]`
+				leftJSON[1] = `[true, true, false, false]`
+				rightJSON[1] = `[true, false, true, false]`
+			}
+
+			for i := range tt.expectedJSON {
+				// each fixture gets its own checked allocator and its own
+				// function scope so leaks are attributed to one fixture
+				func() {
+					mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+					defer mem.AssertSize(t, 0)
+
+					// fail with a clear message on a malformed fixture
+					// rather than panicking on a nil array further down
+					left, _, err := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+						strings.NewReader(leftJSON[i]))
+					require.NoError(t, err)
+					defer left.Release()
+					right, _, err := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+						strings.NewReader(rightJSON[i]))
+					require.NoError(t, err)
+					defer right.Release()
+					exp, _, err := array.FromJSON(mem, arrow.FixedWidthTypes.Boolean,
+						strings.NewReader(tt.expectedJSON[i]))
+					require.NoError(t, err)
+					defer exp.Release()
+
+					checkScalarBinary(t, tt.fn, &compute.ArrayDatum{Value: left.Data()}, &compute.ArrayDatum{Value: right.Data()}, &compute.ArrayDatum{Value: exp.Data()}, nil)
+					ctx := compute.WithAllocator(context.Background(), mem)
+					checkBooleanScalarArrayBinary(t, ctx, tt.fn, &compute.ArrayDatum{Value: left.Data()})
+				}()
+			}
+		})
+	}
+}
diff --git a/go/arrow/compute/utils.go b/go/arrow/compute/utils.go
index 32ad97b586d..8a27176a12b 100644
--- a/go/arrow/compute/utils.go
+++ b/go/arrow/compute/utils.go
@@ -20,7 +20,10 @@ import (
"io"
"math"
+ "github.com/apache/arrow/go/v10/arrow"
"github.com/apache/arrow/go/v10/arrow/bitutil"
+ "github.com/apache/arrow/go/v10/arrow/compute/internal/exec"
+ "github.com/apache/arrow/go/v10/arrow/internal/debug"
"github.com/apache/arrow/go/v10/arrow/memory"
"golang.org/x/xerrors"
)
@@ -81,3 +84,167 @@ func (b *bufferWriteSeeker) Seek(offset int64, whence int) (int64, error) {
b.pos = newpos
return int64(newpos), nil
}
+
+// ensureDictionaryDecoded rewrites vals in place, replacing every dictionary
+// type with its value type.
+//
+// It is used by DispatchBest to work out the proper types for promotion. No
+// data is decoded here: the executor performs the cast, and with it the
+// actual dictionary decoding, after DispatchBest returns (see execInternal
+// in exec.go).
+func ensureDictionaryDecoded(vals ...arrow.DataType) {
+	for i := 0; i < len(vals); i++ {
+		if vals[i].ID() != arrow.DICTIONARY {
+			continue
+		}
+		vals[i] = vals[i].(*arrow.DictionaryType).ValueType
+	}
+}
+
+// replaceNullWithOtherType expects exactly two types and, when one of them
+// is the null type, overwrites it in place with the other. The first slot is
+// checked first; at most one slot is modified.
+func replaceNullWithOtherType(vals ...arrow.DataType) {
+	debug.Assert(len(vals) == 2, "should be length 2")
+
+	first, second := vals[0], vals[1]
+	switch {
+	case first.ID() == arrow.NULL:
+		vals[0] = second
+	case second.ID() == arrow.NULL:
+		vals[1] = first
+	}
+}
+
+// commonTemporalResolution returns the finest time unit found among vals and
+// whether any of vals is a temporal type at all.
+//
+// The unit starts at arrow.Second. Date32 marks the result as temporal
+// without refining the unit, Date64 refines it to at least
+// arrow.Millisecond, and any other type implementing arrow.TemporalWithUnit
+// contributes its own unit. All other types are ignored.
+func commonTemporalResolution(vals ...arrow.DataType) (arrow.TimeUnit, bool) {
+	var (
+		finest   = arrow.Second
+		temporal = false
+	)
+
+	for _, candidate := range vals {
+		switch typed := candidate.(type) {
+		case *arrow.Date32Type:
+			temporal = true
+		case *arrow.Date64Type:
+			finest, temporal = exec.Max(finest, arrow.Millisecond), true
+		case arrow.TemporalWithUnit:
+			finest, temporal = exec.Max(finest, typed.TimeUnit()), true
+		}
+	}
+
+	return finest, temporal
+}
+
+// replaceTemporalTypes rewrites the temporal entries of vals in place so
+// that they all use unit:
+//
+//   - timestamps keep their time zone and take the new unit
+//   - time32/time64 become time64 when unit is finer than milliseconds and
+//     time32 otherwise
+//   - durations take the new unit
+//   - date32/date64 become timestamps with the new unit
+//
+// Non-temporal entries are left untouched. Every replacement is a freshly
+// allocated type: the incoming type values are never modified, because they
+// may be shared instances and changing their unit would affect every other
+// user of that type.
+func replaceTemporalTypes(unit arrow.TimeUnit, vals ...arrow.DataType) {
+	for i, v := range vals {
+		switch dt := v.(type) {
+		case *arrow.TimestampType:
+			vals[i] = &arrow.TimestampType{Unit: unit, TimeZone: dt.TimeZone}
+		case *arrow.Time32Type, *arrow.Time64Type:
+			if unit > arrow.Millisecond {
+				vals[i] = &arrow.Time64Type{Unit: unit}
+			} else {
+				vals[i] = &arrow.Time32Type{Unit: unit}
+			}
+		case *arrow.DurationType:
+			vals[i] = &arrow.DurationType{Unit: unit}
+		case *arrow.Date32Type, *arrow.Date64Type:
+			vals[i] = &arrow.TimestampType{Unit: unit}
+		}
+	}
+}
+
+// replaceTypes overwrites every entry of vals, in place, with replacement.
+func replaceTypes(replacement arrow.DataType, vals ...arrow.DataType) {
+	for i := 0; i < len(vals); i++ {
+		vals[i] = replacement
+	}
+}
+
+// commonNumeric returns the numeric type that all of vals should be promoted
+// to, or nil when no common numeric type exists (some value is neither
+// integer nor floating point, or is float16).
+//
+// Any float64 input yields float64, otherwise any float32 input yields
+// float32. For all-integer inputs the result is the widest unsigned type
+// when every input is unsigned, and otherwise a signed type chosen from the
+// widest signed and unsigned bit widths seen.
+func commonNumeric(vals ...arrow.DataType) arrow.DataType {
+ for _, v := range vals {
+ if !arrow.IsFloating(v.ID()) && !arrow.IsInteger(v.ID()) {
+ // a common numeric type is only possible if all are numeric
+ return nil
+ }
+ if v.ID() == arrow.FLOAT16 {
+ // float16 arithmetic is not currently supported
+ return nil
+ }
+ }
+
+ // float64 takes precedence over every other numeric type
+ for _, v := range vals {
+ if v.ID() == arrow.FLOAT64 {
+ return arrow.PrimitiveTypes.Float64
+ }
+ }
+
+ // then float32
+ for _, v := range vals {
+ if v.ID() == arrow.FLOAT32 {
+ return arrow.PrimitiveTypes.Float32
+ }
+ }
+
+ // only integers remain: track the widest signed and unsigned bit widths
+ maxWidthSigned, maxWidthUnsigned := 0, 0
+ for _, v := range vals {
+ if arrow.IsUnsignedInteger(v.ID()) {
+ maxWidthUnsigned = exec.Max(v.(arrow.FixedWidthDataType).BitWidth(), maxWidthUnsigned)
+ } else {
+ maxWidthSigned = exec.Max(v.(arrow.FixedWidthDataType).BitWidth(), maxWidthSigned)
+ }
+ }
+
+ // no signed input at all: the result stays unsigned
+ if maxWidthSigned == 0 {
+ switch {
+ case maxWidthUnsigned >= 64:
+ return arrow.PrimitiveTypes.Uint64
+ case maxWidthUnsigned == 32:
+ return arrow.PrimitiveTypes.Uint32
+ case maxWidthUnsigned == 16:
+ return arrow.PrimitiveTypes.Uint16
+ default:
+ debug.Assert(maxWidthUnsigned == 8, "bad maxWidthUnsigned")
+ return arrow.PrimitiveTypes.Uint8
+ }
+ }
+
+ // mixed signedness: when the unsigned width is at least the signed width,
+ // widen the signed result to the next power of two above the unsigned
+ // width (presumably so it can hold every unsigned input value); widths
+ // beyond 64 are capped at int64 by the switch below
+ if maxWidthSigned <= maxWidthUnsigned {
+ maxWidthSigned = bitutil.NextPowerOf2(maxWidthUnsigned + 1)
+ }
+
+ switch {
+ case maxWidthSigned >= 64:
+ return arrow.PrimitiveTypes.Int64
+ case maxWidthSigned == 32:
+ return arrow.PrimitiveTypes.Int32
+ case maxWidthSigned == 16:
+ return arrow.PrimitiveTypes.Int16
+ default:
+ debug.Assert(maxWidthSigned == 8, "bad maxWidthSigned")
+ return arrow.PrimitiveTypes.Int8
+ }
+}
+
+// hasDecimal reports whether at least one of vals has a decimal type id.
+func hasDecimal(vals ...arrow.DataType) bool {
+	for i := 0; i < len(vals); i++ {
+		if id := vals[i].ID(); arrow.IsDecimal(id) {
+			return true
+		}
+	}
+	return false
+}
+
+// decimalPromotion selects the promotion rule passed to
+// castBinaryDecimalArgs.
+type decimalPromotion uint8
+
+// Available promotion rules; the names suggest one per arithmetic operation
+// family, plus decPromoteNone as the zero value.
+const (
+ decPromoteNone decimalPromotion = iota
+ decPromoteAdd
+ decPromoteMultiply
+ decPromoteDivide
+)
+
+// castBinaryDecimalArgs is a placeholder: it ignores its arguments and
+// always returns arrow.ErrNotImplemented.
+func castBinaryDecimalArgs(promote decimalPromotion, vals ...arrow.DataType) error {
+ return arrow.ErrNotImplemented
+}
diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go
index 4096084b076..3d21062beb7 100644
--- a/go/arrow/csv/reader_test.go
+++ b/go/arrow/csv/reader_test.go
@@ -638,7 +638,7 @@ func BenchmarkRead(b *testing.B) {
return buf.Bytes()
}
- for _, rows := range []int{10, 1e2, 1e3, 1e4, 1e5} {
+ for _, rows := range []int{10, 1e2, 1e3, 1e4} {
for _, cols := range []int{1, 10, 100, 1000} {
raw := gen(rows, cols)
for _, chunks := range []int{-1, 0, 10, 100, 1000} {
diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go
index 2cd27cf64d7..8514a17161a 100644
--- a/go/arrow/datatype.go
+++ b/go/arrow/datatype.go
@@ -291,6 +291,16 @@ func IsUnsignedInteger(t Type) bool {
return false
}
+// IsFloating reports whether t is one of the floating point type ids:
+// FLOAT16, FLOAT32 or FLOAT64.
+func IsFloating(t Type) bool {
+	return t == FLOAT16 || t == FLOAT32 || t == FLOAT64
+}
+
// IsPrimitive returns true if the provided type ID represents a fixed width
// primitive type.
func IsPrimitive(t Type) bool {
diff --git a/go/go.sum b/go/go.sum
index 04695d55594..b247b659ccf 100644
--- a/go/go.sum
+++ b/go/go.sum
@@ -137,6 +137,7 @@ github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qq
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
+github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
diff --git a/go/internal/bitutils/bit_block_counter.go b/go/internal/bitutils/bit_block_counter.go
index 1ecd03e37b4..2e2b6a674cc 100644
--- a/go/internal/bitutils/bit_block_counter.go
+++ b/go/internal/bitutils/bit_block_counter.go
@@ -291,6 +291,41 @@ func VisitBitBlocksShort(bitmap []byte, offset, length int64, visitValid func(po
return nil
}
+func VisitTwoBitBlocks(leftBitmap, rightBitmap []byte, leftOffset, rightOffset int64, len int64, visitValid func(pos int64), visitNull func()) { // walks positions [0, len) of two bitmaps, calling visitValid(pos) or visitNull() once per position; NOTE: the len parameter shadows the builtin len inside this body
+ if leftBitmap == nil || rightBitmap == nil {
+ // at most one bitmap is present: delegate to the single-bitmap visitor (the right side is passed, still nil, when both are nil)
+ if leftBitmap == nil {
+ VisitBitBlocks(rightBitmap, rightOffset, len, visitValid, visitNull)
+ } else {
+ VisitBitBlocks(leftBitmap, leftOffset, len, visitValid, visitNull)
+ }
+ return
+ }
+
+ bitCounter := NewBinaryBitBlockCounter(leftBitmap, rightBitmap, leftOffset, rightOffset, len)
+ var pos int64 // position relative to the two offsets; this is the value handed to visitValid
+ for pos < len { // NOTE(review): assumes NextAndWord never returns a zero-length block while pos < len, otherwise this loop would not advance; confirm
+ block := bitCounter.NextAndWord() // presumably counts bits set in both bitmaps for the next run of positions; confirm against BinaryBitBlockCounter
+ if block.AllSet() {
+ for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { // pos advances in the post statement, once per visited position
+ visitValid(pos)
+ }
+ } else if block.NoneSet() {
+ for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { // visitNull receives no position, but pos must still advance
+ visitNull()
+ }
+ } else {
+ for i := 0; i < int(block.Len); i, pos = i+1, pos+1 { // mixed block: fall back to testing each position individually
+ if bitutil.BitIsSet(leftBitmap, int(leftOffset+pos)) && bitutil.BitIsSet(rightBitmap, int(rightOffset+pos)) { // valid only when the bit is set in both bitmaps
+ visitValid(pos)
+ } else {
+ visitNull()
+ }
+ }
+ }
+ }
+}
+
type bitOp struct {
bit func(bool, bool) bool
word func(uint64, uint64) uint64
diff --git a/go/parquet/pqarrow/reader_writer_test.go b/go/parquet/pqarrow/reader_writer_test.go
index 91dd6b6b7ec..3821f591d2e 100644
--- a/go/parquet/pqarrow/reader_writer_test.go
+++ b/go/parquet/pqarrow/reader_writer_test.go
@@ -177,7 +177,7 @@ func benchReadTable(b *testing.B, name string, tbl arrow.Table, nbytes int64) {
b.SetBytes(nbytes)
for i := 0; i < b.N; i++ {
- pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()), nil, nil)
+ pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
if err != nil {
b.Error(err)
}
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 16bd0985e23..578050e710a 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -189,22 +189,22 @@ cdef class ReadOptions(_Weakrefable):
self.options.reset(new CCSVReadOptions(CCSVReadOptions.Defaults()))
def __init__(self, *, use_threads=None, block_size=None, skip_rows=None,
- column_names=None, autogenerate_column_names=None,
- encoding='utf8', skip_rows_after_names=None):
+ skip_rows_after_names=None, column_names=None,
+ autogenerate_column_names=None, encoding='utf8'):
if use_threads is not None:
self.use_threads = use_threads
if block_size is not None:
self.block_size = block_size
if skip_rows is not None:
self.skip_rows = skip_rows
+ if skip_rows_after_names is not None:
+ self.skip_rows_after_names = skip_rows_after_names
if column_names is not None:
self.column_names = column_names
if autogenerate_column_names is not None:
self.autogenerate_column_names= autogenerate_column_names
# Python-specific option
self.encoding = encoding
- if skip_rows_after_names is not None:
- self.skip_rows_after_names = skip_rows_after_names
@property
def use_threads(self):
@@ -243,6 +243,23 @@ cdef class ReadOptions(_Weakrefable):
def skip_rows(self, value):
deref(self.options).skip_rows = value
+ @property
+ def skip_rows_after_names(self):
+ """
+ The number of rows to skip after the column names.
+ This number can be larger than the number of rows in one
+ block, and empty rows are counted.
+ The order of application is as follows:
+ - `skip_rows` is applied (if non-zero);
+ - column names are read (unless `column_names` is set);
+ - `skip_rows_after_names` is applied (if non-zero).
+ """
+ return deref(self.options).skip_rows_after_names
+
+ @skip_rows_after_names.setter
+ def skip_rows_after_names(self, value):
+ deref(self.options).skip_rows_after_names = value
+
@property
def column_names(self):
"""
@@ -271,23 +288,6 @@ cdef class ReadOptions(_Weakrefable):
def autogenerate_column_names(self, value):
deref(self.options).autogenerate_column_names = value
- @property
- def skip_rows_after_names(self):
- """
- The number of rows to skip after the column names.
- This number can be larger than the number of rows in one
- block, and empty rows are counted.
- The order of application is as follows:
- - `skip_rows` is applied (if non-zero);
- - column names aread (unless `column_names` is set);
- - `skip_rows_after_names` is applied (if non-zero).
- """
- return deref(self.options).skip_rows_after_names
-
- @skip_rows_after_names.setter
- def skip_rows_after_names(self, value):
- deref(self.options).skip_rows_after_names = value
-
def validate(self):
check_status(deref(self.options).Validate())
@@ -296,11 +296,11 @@ cdef class ReadOptions(_Weakrefable):
self.use_threads == other.use_threads and
self.block_size == other.block_size and
self.skip_rows == other.skip_rows and
+ self.skip_rows_after_names == other.skip_rows_after_names and
self.column_names == other.column_names and
self.autogenerate_column_names ==
other.autogenerate_column_names and
- self.encoding == other.encoding and
- self.skip_rows_after_names == other.skip_rows_after_names
+ self.encoding == other.encoding
)
@staticmethod
@@ -605,11 +605,6 @@ cdef class ConvertOptions(_Weakrefable):
decimal_point : 1-character string, optional (default '.')
The character used as decimal point in floating-point and decimal
data.
- timestamp_parsers : list, optional
- A sequence of strptime()-compatible format strings, tried in order
- when attempting to infer or convert timestamp values (the special
- value ISO8601() can also be given). By default, a fast built-in
- ISO-8601 parser is used.
strings_can_be_null : bool, optional (default False)
Whether string / binary columns can have null values.
If true, then strings in null_values are considered null for
@@ -620,16 +615,6 @@ cdef class ConvertOptions(_Weakrefable):
If true, then strings in "null_values" are also considered null
when they appear quoted in the CSV file. Otherwise, quoted values
are never considered null.
- auto_dict_encode : bool, optional (default False)
- Whether to try to automatically dict-encode string / binary data.
- If true, then when type inference detects a string or binary column,
- it it dict-encoded up to `auto_dict_max_cardinality` distinct values
- (per chunk), after which it switches to regular encoding.
- This setting is ignored for non-inferred columns (those in
- `column_types`).
- auto_dict_max_cardinality : int, optional
- The maximum dictionary cardinality for `auto_dict_encode`.
- This value is per chunk.
include_columns : list, optional
The names of columns to include in the Table.
If empty, the Table will include all columns from the CSV file.
@@ -641,6 +626,21 @@ cdef class ConvertOptions(_Weakrefable):
produce a column of nulls (whose type is selected using
`column_types`, or null by default).
This option is ignored if `include_columns` is empty.
+ auto_dict_encode : bool, optional (default False)
+ Whether to try to automatically dict-encode string / binary data.
+ If true, then when type inference detects a string or binary column,
+ it is dict-encoded up to `auto_dict_max_cardinality` distinct values
+ (per chunk), after which it switches to regular encoding.
+ This setting is ignored for non-inferred columns (those in
+ `column_types`).
+ auto_dict_max_cardinality : int, optional
+ The maximum dictionary cardinality for `auto_dict_encode`.
+ This value is per chunk.
+ timestamp_parsers : list, optional
+ A sequence of strptime()-compatible format strings, tried in order
+ when attempting to infer or convert timestamp values (the special
+ value ISO8601() can also be given). By default, a fast built-in
+ ISO-8601 parser is used.
Examples
--------
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 4ab08d45542..154a02481c9 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -257,7 +257,7 @@ cdef class Dataset(_Weakrefable):
... 'n_legs': [2, 2, 4, 4, 5, 100],
... 'animal': ["Flamingo", "Parrot", "Dog", "Horse",
... "Brittle stars", "Centipede"]})
- >>>
+ >>>
>>> import pyarrow.parquet as pq
>>> pq.write_table(table, "dataset_scanner.parquet")
@@ -1221,12 +1221,12 @@ cdef class CsvFileFormat(FileFormat):
----------
parse_options : pyarrow.csv.ParseOptions
Options regarding CSV parsing.
+ default_fragment_scan_options : CsvFragmentScanOptions
+ Default options for fragments scan.
convert_options : pyarrow.csv.ConvertOptions
Options regarding value conversion.
read_options : pyarrow.csv.ReadOptions
General read options.
- default_fragment_scan_options : CsvFragmentScanOptions
- Default options for fragments scan.
"""
cdef:
CCsvFileFormat* csv_format
@@ -2315,17 +2315,17 @@ cdef class Scanner(_Weakrefable):
projections.
The list of columns or expressions may use the special fields
- `__batch_index` (the index of the batch within the fragment),
- `__fragment_index` (the index of the fragment within the dataset),
+ `__batch_index` (the index of the batch within the fragment),
+ `__fragment_index` (the index of the fragment within the dataset),
`__last_in_fragment` (whether the batch is last in fragment), and
- `__filename` (the name of the source file or a description of the
+ `__filename` (the name of the source file or a description of the
source fragment).
The columns will be passed down to Datasets and corresponding data
fragments to avoid loading, copying, and deserializing columns
that will not be required further down the compute chain.
- By default all of the available columns are projected.
- Raises an exception if any of the referenced column names does
+ By default all of the available columns are projected.
+ Raises an exception if any of the referenced column names does
not exist in the dataset's Schema.
filter : Expression, default None
Scan will return only the rows matching the filter.
@@ -2338,8 +2338,9 @@ cdef class Scanner(_Weakrefable):
record batches are overflowing memory then this method can be
called to reduce their size.
batch_readahead : int, default 16
- The number of batches to read ahead in a file. Increasing this number
- will increase RAM usage but could also improve IO utilization.
+ The number of batches to read ahead in a file. This might not work
+ for all file formats. Increasing this number will increase
+ RAM usage but could also improve IO utilization.
fragment_readahead : int, default 4
The number of files to read ahead. Increasing this number will increase
RAM usage but could also improve IO utilization.
@@ -2375,14 +2376,13 @@ cdef class Scanner(_Weakrefable):
return self.wrapped
@staticmethod
- def from_dataset(Dataset dataset not None,
- bint use_threads=True, object use_async=None,
- MemoryPool memory_pool=None,
- object columns=None, Expression filter=None,
- int batch_size=_DEFAULT_BATCH_SIZE,
+ def from_dataset(Dataset dataset not None, *, object columns=None,
+ Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
int batch_readahead=_DEFAULT_BATCH_READAHEAD,
int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
- FragmentScanOptions fragment_scan_options=None):
+ FragmentScanOptions fragment_scan_options=None,
+ bint use_threads=True, object use_async=None,
+ MemoryPool memory_pool=None):
"""
Create Scanner from Dataset,
@@ -2397,10 +2397,10 @@ cdef class Scanner(_Weakrefable):
projections.
The list of columns or expressions may use the special fields
- `__batch_index` (the index of the batch within the fragment),
- `__fragment_index` (the index of the fragment within the dataset),
+ `__batch_index` (the index of the batch within the fragment),
+ `__fragment_index` (the index of the fragment within the dataset),
`__last_in_fragment` (whether the batch is last in fragment), and
- `__filename` (the name of the source file or a description of the
+ `__filename` (the name of the source file or a description of the
source fragment).
The columns will be passed down to Datasets and corresponding data
@@ -2426,6 +2426,9 @@ cdef class Scanner(_Weakrefable):
fragment_readahead : int, default 4
The number of files to read ahead. Increasing this number will increase
RAM usage but could also improve IO utilization.
+ fragment_scan_options : FragmentScanOptions, default None
+ Options specific to a particular scan and fragment type, which
+ can change between different scans of the same dataset.
use_threads : bool, default True
If enabled, then maximum parallelism will be used determined by
the number of available CPU cores.
@@ -2436,9 +2439,6 @@ cdef class Scanner(_Weakrefable):
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
- fragment_scan_options : FragmentScanOptions, default None
- Options specific to a particular scan and fragment type, which
- can change between different scans of the same dataset.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2461,13 +2461,13 @@ cdef class Scanner(_Weakrefable):
return Scanner.wrap(scanner)
@staticmethod
- def from_fragment(Fragment fragment not None, Schema schema=None,
- bint use_threads=True, object use_async=None,
- MemoryPool memory_pool=None,
+ def from_fragment(Fragment fragment not None, *, Schema schema=None,
object columns=None, Expression filter=None,
int batch_size=_DEFAULT_BATCH_SIZE,
int batch_readahead=_DEFAULT_BATCH_READAHEAD,
- FragmentScanOptions fragment_scan_options=None):
+ FragmentScanOptions fragment_scan_options=None,
+ bint use_threads=True, object use_async=None,
+ MemoryPool memory_pool=None,):
"""
Create Scanner from Fragment,
@@ -2484,10 +2484,10 @@ cdef class Scanner(_Weakrefable):
projections.
The list of columns or expressions may use the special fields
- `__batch_index` (the index of the batch within the fragment),
- `__fragment_index` (the index of the fragment within the dataset),
+ `__batch_index` (the index of the batch within the fragment),
+ `__fragment_index` (the index of the fragment within the dataset),
`__last_in_fragment` (whether the batch is last in fragment), and
- `__filename` (the name of the source file or a description of the
+ `__filename` (the name of the source file or a description of the
source fragment).
The columns will be passed down to Datasets and corresponding data
@@ -2510,6 +2510,9 @@ cdef class Scanner(_Weakrefable):
The number of batches to read ahead in a file. This might not work
for all file formats. Increasing this number will increase
RAM usage but could also improve IO utilization.
+ fragment_scan_options : FragmentScanOptions, default None
+ Options specific to a particular scan and fragment type, which
+ can change between different scans of the same dataset.
use_threads : bool, default True
If enabled, then maximum parallelism will be used determined by
the number of available CPU cores.
@@ -2520,9 +2523,6 @@ cdef class Scanner(_Weakrefable):
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
- fragment_scan_options : FragmentScanOptions, default None
- Options specific to a particular scan and fragment type, which
- can change between different scans of the same dataset.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2549,11 +2549,11 @@ cdef class Scanner(_Weakrefable):
return Scanner.wrap(scanner)
@staticmethod
- def from_batches(source, Schema schema=None, bint use_threads=True,
- object use_async=None, MemoryPool memory_pool=None,
- object columns=None, Expression filter=None,
- int batch_size=_DEFAULT_BATCH_SIZE,
- FragmentScanOptions fragment_scan_options=None):
+ def from_batches(source, *, Schema schema=None, object columns=None,
+ Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
+ FragmentScanOptions fragment_scan_options=None,
+ bint use_threads=True, object use_async=None,
+ MemoryPool memory_pool=None):
"""
Create a Scanner from an iterator of batches.
@@ -2574,6 +2574,8 @@ cdef class Scanner(_Weakrefable):
Scan will return only the rows matching the filter.
batch_size : int, default 128Ki
The maximum row count for scanned record batches.
+ fragment_scan_options : FragmentScanOptions
+ The fragment scan options.
use_threads : bool, default True
If enabled, then maximum parallelism will be used determined by
the number of available CPU cores.
@@ -2584,8 +2586,6 @@ cdef class Scanner(_Weakrefable):
memory_pool : MemoryPool, default None
For memory allocations, if required. If not specified, uses the
default pool.
- fragment_scan_options : FragmentScanOptions
- The fragment scan options.
"""
cdef:
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 036301d90eb..86d1f0e39cf 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -702,11 +702,11 @@ cdef class _PandasConvertible(_Weakrefable):
memory_pool : MemoryPool, default None
Arrow MemoryPool to use for allocations. Uses the default memory
pool is not passed.
- strings_to_categorical : bool, default False
- Encode string (UTF8) and binary types to pandas.Categorical.
categories : list, default empty
List of fields that should be returned as pandas.Categorical. Only
applies to table-like data structures.
+ strings_to_categorical : bool, default False
+ Encode string (UTF8) and binary types to pandas.Categorical.
zero_copy_only : bool, default False
Raise an ArrowException if this function call would require copying
the underlying data.
@@ -2549,11 +2549,11 @@ cdef class DictionaryArray(Array):
The array of values referenced by the indices.
mask : ndarray or pandas.Series, bool type
True values indicate that indices are actually null.
+ ordered : bool, default False
+ Set to True if the category values are ordered.
from_pandas : bool, default False
If True, the indices should be treated as though they originated in
a pandas.Categorical (null encoded as -1).
- ordered : bool, default False
- Set to True if the category values are ordered.
safe : bool, default True
If True, check that the dictionary indices are in range.
memory_pool : MemoryPool, default None
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 90822766db7..a9f127ef468 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -106,12 +106,12 @@ cdef class IpcReadOptions(_Weakrefable):
Parameters
----------
- use_threads : bool
- Whether to use the global CPU thread pool to parallelize any
- computational tasks like decompression.
ensure_native_endian : bool
Whether to convert incoming data to platform-native endianness.
Default is true.
+ use_threads : bool
+ Whether to use the global CPU thread pool to parallelize any
+ computational tasks like decompression.
included_fields : list
If empty (the default), return all deserialized fields.
If non-empty, the values are the indices of fields to read on
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index d63c323b335..fc724109d94 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -59,6 +59,12 @@ def __init__(self, source, *, options=None, memory_pool=None):
Either a file path, or a writable file object.
schema : pyarrow.Schema
The Arrow schema for data to be written to the file.
+use_legacy_format : bool, default None
+ Deprecated in favor of setting options. Cannot be provided with
+ options.
+
+ If None, False will be used unless this default is overridden by
+ setting the environment variable ARROW_PRE_0_15_IPC_FORMAT=1
options : pyarrow.ipc.IpcWriteOptions
Options for IPC serialization.
@@ -66,13 +72,7 @@ def __init__(self, source, *, options=None, memory_pool=None):
be used unless overridden by setting the environment variable
ARROW_PRE_0_15_IPC_FORMAT=1, and the V5 metadata version will be
used unless overridden by setting the environment variable
- ARROW_PRE_1_0_METADATA_VERSION=1.
-use_legacy_format : bool, default None
- Deprecated in favor of setting options. Cannot be provided with
- options.
-
- If None, False will be used unless this default is overridden by
- setting the environment variable ARROW_PRE_0_15_IPC_FORMAT=1"""
+ ARROW_PRE_1_0_METADATA_VERSION=1."""
class RecordBatchStreamWriter(lib._RecordBatchStreamWriter):
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index da3315441c0..5716719dde2 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -229,6 +229,8 @@ class ParquetFile:
common_metadata : FileMetaData, default None
Will be used in reads for pandas schema metadata if not found in the
main file's metadata, no other uses at the moment.
+ read_dictionary : list
+ List of column names to read directly as DictionaryArray.
memory_map : bool, default False
If the source is a file path, use a memory map to read file, which can
improve performance in some environments.
@@ -239,8 +241,6 @@ class ParquetFile:
Coalesce and issue file reads in parallel to improve performance on
high-latency filesystems (e.g. S3). If True, Arrow will use a
background I/O thread pool.
- read_dictionary : list
- List of column names to read directly as DictionaryArray.
coerce_int96_timestamp_unit : str, default None.
Cast timestamps that are stored in INT96 format to a particular
resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
@@ -737,6 +737,12 @@ def _sanitize_table(table, new_schema, flavor):
use_dictionary : bool or list
Specify if we should use dictionary encoding in general or only for
some columns.
+compression : str or dict
+ Specify the compression codec, either on a general basis or per-column.
+ Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
+write_statistics : bool or list
+ Specify if we should write statistics in general (default is True) or only
+ for some columns.
use_deprecated_int96_timestamps : bool, default None
Write timestamps to INT96 Parquet format. Defaults to False unless enabled
by flavor argument. This take priority over the coerce_timestamps option.
@@ -750,22 +756,16 @@ def _sanitize_table(table, new_schema, flavor):
If the casting results in loss of data, it will raise an exception
unless ``allow_truncated_timestamps=True`` is given.
Valid values: {None, 'ms', 'us'}
-data_page_size : int, default None
- Set a target threshold for the approximate encoded size of data
- pages within a column chunk (in bytes). If None, use the default data page
- size of 1MByte.
allow_truncated_timestamps : bool, default False
Allow loss of data when coercing timestamps to a particular
resolution. E.g. if microsecond or nanosecond data is lost when coercing to
'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True``
will NOT result in the truncation exception being ignored unless
``coerce_timestamps`` is not None.
-compression : str or dict
- Specify the compression codec, either on a general basis or per-column.
- Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
-write_statistics : bool or list
- Specify if we should write statistics in general (default is True) or only
- for some columns.
+data_page_size : int, default None
+ Set a target threshold for the approximate encoded size of data
+ pages within a column chunk (in bytes). If None, use the default data page
+ size of 1MByte.
flavor : {'spark'}, default None
Sanitize schema or set other compatibility options to work with
various target systems.
@@ -1095,12 +1095,12 @@ class ParquetDatasetPiece:
Path to file in the file system where this piece is located.
open_file_func : callable
Function to use for obtaining file handle to dataset piece.
- partition_keys : list of tuples
- Two-element tuples of ``(column name, ordinal index)``.
- row_group : int, default None
- Row group to load. By default, reads all row groups.
file_options : dict
Options
+ row_group : int, default None
+ Row group to load. By default, reads all row groups.
+ partition_keys : list of tuples
+ Two-element tuples of ``(column name, ordinal index)``.
"""
def __init__(self, path, open_file_func=partial(open, mode='rb'),
@@ -1650,11 +1650,11 @@ class ParquetDataset:
If nothing passed, will be inferred based on path.
Path will try to be found in the local on-disk filesystem otherwise
it will be parsed as an URI to determine the filesystem.
-metadata : pyarrow.parquet.FileMetaData
- Use metadata obtained elsewhere to validate file schemas.
schema : pyarrow.parquet.Schema
Use schema obtained elsewhere to validate file schemas. Alternative to
metadata parameter.
+metadata : pyarrow.parquet.FileMetaData
+ Use metadata obtained elsewhere to validate file schemas.
split_row_groups : bool, default False
Divide files into pieces for each row group in the file.
validate_schema : bool, default True
@@ -2666,19 +2666,6 @@ def partitioning(self):
Optionally provide the Schema for the parquet dataset, in which case it
will not be inferred from the source.
{1}
-use_legacy_dataset : bool, default False
- By default, `read_table` uses the new Arrow Datasets API since
- pyarrow 1.0.0. Among other things, this allows to pass `filters`
- for all columns and not only the partition keys, enables
- different partitioning schemes, etc.
- Set to True to use the legacy behaviour (this option is deprecated,
- and the legacy implementation will be removed in a future version).
-ignore_prefixes : list, optional
- Files matching any of these prefixes will be ignored by the
- discovery process if use_legacy_dataset=False.
- This is matched to the basename of a path.
- By default this is ['.', '_'].
- Note that discovery happens only if a directory is passed as source.
filesystem : FileSystem, default None
If nothing passed, will be inferred based on path.
Path will try to be found in the local on-disk filesystem otherwise
@@ -2693,6 +2680,19 @@ def partitioning(self):
and different partitioning schemes are supported.
{3}
+use_legacy_dataset : bool, default False
+ By default, `read_table` uses the new Arrow Datasets API since
+ pyarrow 1.0.0. Among other things, this allows to pass `filters`
+ for all columns and not only the partition keys, enables
+ different partitioning schemes, etc.
+ Set to True to use the legacy behaviour (this option is deprecated,
+ and the legacy implementation will be removed in a future version).
+ignore_prefixes : list, optional
+ Files matching any of these prefixes will be ignored by the
+ discovery process if use_legacy_dataset=False.
+ This is matched to the basename of a path.
+ By default this is ['.', '_'].
+ Note that discovery happens only if a directory is passed as source.
pre_buffer : bool, default True
Coalesce and issue file reads in parallel to improve performance on
high-latency filesystems (e.g. S3). If True, Arrow will use a
@@ -2805,9 +2805,9 @@ def partitioning(self):
def read_table(source, *, columns=None, use_threads=True, metadata=None,
- schema=None, use_pandas_metadata=False, memory_map=False,
- read_dictionary=None, filesystem=None, filters=None,
- buffer_size=0, partitioning="hive", use_legacy_dataset=False,
+ schema=None, use_pandas_metadata=False, read_dictionary=None,
+ memory_map=False, buffer_size=0, partitioning="hive",
+ filesystem=None, filters=None, use_legacy_dataset=False,
ignore_prefixes=None, pre_buffer=True,
coerce_int96_timestamp_unit=None,
decryption_properties=None, thrift_string_size_limit=None,
@@ -2914,10 +2914,9 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None,
Note: starting with pyarrow 1.0, the default for `use_legacy_dataset` is
switched to False.""",
- "\n".join((_read_docstring_common,
- """use_pandas_metadata : bool, default False
+ "\n".join(("""use_pandas_metadata : bool, default False
If True and file has custom pandas schema metadata, ensure that
- index columns are also loaded.""")),
+ index columns are also loaded.""", _read_docstring_common)),
"""pyarrow.Table
Content of the file as a table (of columns)""",
_DNF_filter_doc, _read_table_example)
@@ -3086,10 +3085,6 @@ def write_to_dataset(table, root_path, partition_cols=None,
table : pyarrow.Table
root_path : str, pathlib.Path
The root directory of the dataset
- filesystem : FileSystem, default None
- If nothing passed, will be inferred based on path.
- Path will try to be found in the local on-disk filesystem otherwise
- it will be parsed as an URI to determine the filesystem.
partition_cols : list,
Column names by which to partition the dataset.
Columns are partitioned in the order they are given
@@ -3100,16 +3095,16 @@ def write_to_dataset(table, root_path, partition_cols=None,
This option is only supported for use_legacy_dataset=True.
When use_legacy_dataset=None and this option is specified,
use_legacy_datase will be set to True.
+ filesystem : FileSystem, default None
+ If nothing passed, will be inferred based on path.
+ Path will try to be found in the local on-disk filesystem otherwise
+ it will be parsed as an URI to determine the filesystem.
use_legacy_dataset : bool
Default is False. Set to True to use the the legacy behaviour
(this option is deprecated, and the legacy implementation will be
removed in a future version). The legacy implementation still
supports the `partition_filename_cb` keyword but is less efficient
when using partition columns.
- use_threads : bool, default True
- Write files in parallel. If enabled, then maximum parallelism will be
- used determined by the number of available CPU cores.
- This option is only supported for use_legacy_dataset=False.
schema : Schema, optional
This option is only supported for use_legacy_dataset=False.
partitioning : Partitioning or list[str], optional
@@ -3124,6 +3119,10 @@ def write_to_dataset(table, root_path, partition_cols=None,
The token '{i}' will be replaced with an automatically incremented
integer. If not specified, it defaults to "guid-{i}.parquet".
This option is only supported for use_legacy_dataset=False.
+ use_threads : bool, default True
+ Write files in parallel. If enabled, then maximum parallelism will be
+ used determined by the number of available CPU cores.
+ This option is only supported for use_legacy_dataset=False.
file_visitor : function
If set, this function will be called with a WrittenFile instance
for each file created during the call. This object will have both
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
index cbff41c7b10..09bc1ed7b3b 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -172,8 +172,8 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
# TODO(wesm): Test other timestamp resolutions now that arrow supports
# them
'datetime': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ms]'),
- 'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
+ dtype='datetime64[ms]').astype('datetime64[ns]'),
+ 'timedelta': np.arange(0, size, dtype="timedelta64[ns]"),
'str': pd.Series([str(x) for x in range(size)]),
'empty_str': [''] * size,
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index 6fdc7435418..ef75c3efac7 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -1252,6 +1252,7 @@ def _test_write_to_dataset_with_partitions(base_path,
'nan': [np.nan] * 10,
'date': np.arange('2017-01-01', '2017-01-11',
dtype='datetime64[D]')})
+ output_df["date"] = output_df["date"].astype('datetime64[ns]')
cols = output_df.columns.tolist()
partition_by = ['group1', 'group2']
output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False,
@@ -1312,6 +1313,7 @@ def _test_write_to_dataset_no_partitions(base_path,
'num': list(range(10)),
'date': np.arange('2017-01-01', '2017-01-11',
dtype='datetime64[D]')})
+ output_df["date"] = output_df["date"].astype('datetime64[ns]')
cols = output_df.columns.tolist()
output_table = pa.Table.from_pandas(output_df)
@@ -1440,6 +1442,7 @@ def test_write_to_dataset_with_partitions_and_custom_filenames(
'nan': [np.nan] * 10,
'date': np.arange('2017-01-01', '2017-01-11',
dtype='datetime64[D]')})
+ output_df["date"] = output_df["date"].astype('datetime64[ns]')
partition_by = ['group1', 'group2']
output_table = pa.Table.from_pandas(output_df)
path = str(tempdir)
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 5274ddce035..37199e38c24 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4865,3 +4865,31 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir):
ds.write_dataset(
scanner, tempdir, partitioning=["original_column"], format="ipc"
)
+
+
+@pytest.mark.parametrize("format", ("ipc", "parquet"))
+def test_read_table_nested_columns(tempdir, format):
+    """Check that nested struct fields can be selected by dot path.
+
+    A table holding a struct column plus a top-level column whose name
+    itself contains dots is written as a dataset and read back, selecting
+    struct children as ``"interaction.<child>"`` and the dotted column by
+    its literal name. The selected struct children come back as columns
+    named after the child alone.
+    """
+    if format == "parquet":
+        pytest.importorskip("pyarrow.parquet")
+
+    table = pa.table({
+        "user_id": ["abc123", "qrs456"],
+        "a.dotted.field": [1, 2],
+        "interaction": [
+            {"type": None, "element": "button",
+             "values": [1, 2], "structs": [{"foo": "bar"}, None]},
+            {"type": "scroll", "element": "window",
+             "values": [None, 3, 4], "structs": [{"fizz": "buzz"}]},
+        ],
+    })
+    ds.write_dataset(table, tempdir / "table", format=format)
+    ds1 = ds.dataset(tempdir / "table", format=format)
+
+    # Dot path to read subsets of nested data
+    result = ds1.to_table(
+        columns=["user_id", "interaction.type", "interaction.values",
+                 "interaction.structs", "a.dotted.field"])
+    assert result.to_pylist() == [
+        {'user_id': 'abc123', 'type': None, 'values': [1, 2],
+         'structs': [{'fizz': None, 'foo': 'bar'}, None], 'a.dotted.field': 1},
+        {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
+         'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
+    ]
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 7022441396e..f843904f126 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -69,7 +69,7 @@ def _alltypes_example(size=100):
# TODO(wesm): Pandas only support ns resolution, Arrow supports s, ms,
# us, ns
'datetime': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ms]'),
+ dtype='datetime64[ms]').astype("datetime64[ns]"),
'str': [str(x) for x in range(size)],
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'empty_str': [''] * size
@@ -1016,7 +1016,7 @@ def test_timestamps_with_timezone(self):
'2007-07-13T01:23:34.123',
'2006-01-13T12:34:56.432',
'2010-08-13T05:46:57.437'],
- dtype='datetime64[ms]')
+ dtype='datetime64[ms]').astype("datetime64[ns]")
})
df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
_check_pandas_roundtrip(df)
@@ -2718,7 +2718,7 @@ def test_strided_data_import(self):
cases.append(boolean_objects)
cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
- dtype='datetime64[ms]')
+ dtype='datetime64[ms]').astype("datetime64[ns]")
.reshape(N, K).copy())
strided_mask = (random_numbers > 0).astype(bool)[:, 0]
diff --git a/r/NAMESPACE b/r/NAMESPACE
index f1f4bd80570..4a0c6ed2619 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -412,6 +412,7 @@ importFrom(purrr,map_dfr)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
importFrom(purrr,reduce)
+importFrom(purrr,walk)
importFrom(rlang,"%||%")
importFrom(rlang,":=")
importFrom(rlang,.data)
diff --git a/r/NEWS.md b/r/NEWS.md
index c0bad9458d1..79925b82b05 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -19,6 +19,74 @@
# arrow 9.0.0.9000
+## Arrow dplyr queries
+
+Several new functions can be used in queries:
+
+* `dplyr::across()` can be used to apply the same computation across multiple
+ columns, and the `where()` selection helper is supported in `across()`;
+* `add_filename()` can be used to get the filename a row came from (only
+ available when querying `?Dataset`);
+* Added five functions in the `slice_*` family: `dplyr::slice_min()`,
+ `dplyr::slice_max()`, `dplyr::slice_head()`, `dplyr::slice_tail()`, and
+ `dplyr::slice_sample()`.
+
+The package now has documentation that lists all `dplyr` methods and R function
+mappings that are supported on Arrow data, along with notes about any
+differences in functionality between queries evaluated in R versus in Acero, the
+Arrow query engine. See `?acero`.
+
+A few new features and bugfixes were implemented for joins:
+
+* Extension arrays are now supported in joins, allowing, for example, joining
+ datasets that contain [geoarrow](https://paleolimbot.github.io/geoarrow/) data.
+* The `keep` argument is now supported, allowing separate columns for the left
+ and right hand side join keys in join output. Full joins now coalesce the
+ join keys (when `keep = FALSE`), avoiding the issue where the join keys would
+ be all `NA` for rows in the right hand side without any matches on the left.
+
+Some changes to improve the consistency of the API:
+
+* In a future release, calling `dplyr::pull()` will return a `?ChunkedArray`
+ instead of an R vector by default. The current default behavior is deprecated.
+ To update to the new behavior now, specify `pull(as_vector = FALSE)` or set
+ `options(arrow.pull_as_vector = FALSE)` globally.
+* Calling `dplyr::compute()` on a query that is grouped returns a `?Table`
+ instead of a query object.
+
+Finally, long-running queries can now be cancelled and will abort their
+computation immediately.
+
+## Arrays and tables
+
+`as_arrow_array()` can now take `blob::blob` and `?vctrs::list_of`, which
+convert to binary and list arrays, respectively. Also fixed an issue where
+`as_arrow_array()` ignored the `type` argument when passed a `StructArray`.
+
+The `unique()` function works on `?Table`, `?RecordBatch`, `?Dataset`, and
+`?RecordBatchReader`.
+
+## Reading and writing
+
+`write_feather()` now accepts `compression = FALSE` to write uncompressed files.
+
+Also, a breaking change for IPC files in `write_dataset()`: passing
+`"ipc"` or `"feather"` to `format` will now write files with the `.arrow`
+extension instead of `.ipc` or `.feather`.
+
+## Installation
+
+As of version 10.0.0, `arrow` requires C++17 to build. This means that:
+
+* On Windows, you need `R >= 4.0`. Version 9.0.0 was the last version to support
+ R 3.6.
+* On CentOS 7, you can build the latest version of `arrow`,
+ but you first need to install a newer compiler than the default system compiler,
+ gcc 4.8. See `vignette("install", package = "arrow")` for guidance.
+ Note that you only need the newer compiler to build `arrow`:
+ installing a binary package, as from RStudio Package Manager,
+ or loading a package you've already installed works fine with the system defaults.
+
# arrow 9.0.0
## Arrow dplyr queries
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 4c5067480a5..aca593551f1 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -18,7 +18,7 @@
#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
#' @importFrom R6 R6Class
#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dbl map_dfr map_int map_lgl keep imap imap_chr
-#' @importFrom purrr flatten reduce
+#' @importFrom purrr flatten reduce walk
#' @importFrom assertthat assert_that is.string
#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos quo
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
@@ -54,7 +54,13 @@ supported_dplyr_methods <- list(
transmute = NULL,
arrange = NULL,
rename = NULL,
- pull = "returns an Arrow [ChunkedArray], not an R vector",
+ pull = c(
+ "the `name` argument is not supported;",
+ "returns an R vector by default but this behavior is deprecated and will",
+ "return an Arrow [ChunkedArray] in a future release. Provide",
+ "`as_vector = TRUE/FALSE` to control this behavior, or set",
+ "`options(arrow.pull_as_vector)` globally."
+ ),
relocate = NULL,
compute = NULL,
collapse = NULL,
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index a45bb7ae574..c42fca00b51 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1876,8 +1876,12 @@ Scalar__ApproxEquals <- function(lhs, rhs) {
.Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs)
}
-schema_ <- function(fields) {
- .Call(`_arrow_schema_`, fields)
+Schema__from_fields <- function(fields) {
+ .Call(`_arrow_Schema__from_fields`, fields)
+}
+
+Schema__from_list <- function(field_list) {
+ .Call(`_arrow_Schema__from_list`, field_list)
}
Schema__ToString <- function(s) {
diff --git a/r/R/compute.R b/r/R/compute.R
index a144e7d678a..1386728ac90 100644
--- a/r/R/compute.R
+++ b/r/R/compute.R
@@ -379,9 +379,17 @@ register_scalar_function <- function(name, fun, in_type, out_type,
RegisterScalarUDF(name, scalar_function)
# register with dplyr binding (enables its use in mutate(), filter(), etc.)
+ binding_fun <- function(...) build_expr(name, ...)
+
+ # inject the value of `name` into the expression to avoid saving this
+ # execution environment in the binding, which eliminates a warning when the
+ # same binding is registered twice
+ body(binding_fun) <- expr_substitute(body(binding_fun), sym("name"), name)
+ environment(binding_fun) <- asNamespace("arrow")
+
register_binding(
name,
- function(...) build_expr(name, ...),
+ binding_fun,
update_cache = TRUE
)
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index be58031d968..8bf22728d6a 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -46,16 +46,51 @@ compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame =
compute.ArrowTabular <- function(x, ...) x
compute.Dataset <- compute.RecordBatchReader <- compute.arrow_dplyr_query
-pull.arrow_dplyr_query <- function(.data, var = -1) {
+# pull() method for Dataset; the same function is also assigned to the
+# RecordBatchReader and arrow_dplyr_query methods.
+#
+# `var` selects the column (resolved with vars_pull(); default -1), `...` is
+# accepted but not used in this body, and `as_vector` (default: the
+# `arrow.pull_as_vector` option) is forwarded to handle_pull_as_vector().
+pull.Dataset <- function(.data,
+ var = -1,
+ ...,
+ as_vector = getOption("arrow.pull_as_vector")) {
.data <- as_adq(.data)
var <- vars_pull(names(.data), !!enquo(var))
+ # Narrow the query to just the requested column before computing it
.data$selected_columns <- set_names(.data$selected_columns[var], var)
- dplyr::compute(.data)[[1]]
+ out <- dplyr::compute(.data)[[1]]
+ handle_pull_as_vector(out, as_vector)
}
-pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query
+pull.RecordBatchReader <- pull.arrow_dplyr_query <- pull.Dataset
-pull.ArrowTabular <- function(x, var = -1) {
- x[[vars_pull(names(x), !!enquo(var))]]
+# pull() method for ArrowTabular objects: extract the column selected by
+# `var` with `[[`, then let handle_pull_as_vector() apply `as_vector`
+# (default: the `arrow.pull_as_vector` option). `...` is accepted but not
+# used in this body.
+pull.ArrowTabular <- function(x,
+ var = -1,
+ ...,
+ as_vector = getOption("arrow.pull_as_vector")) {
+ out <- x[[vars_pull(names(x), !!enquo(var))]]
+ handle_pull_as_vector(out, as_vector)
+}
+
+# Apply the `as_vector` setting to a pulled column `out` and return it.
+#
+# * `as_vector = NULL` (e.g. the `arrow.pull_as_vector` option is unset):
+#   emit the deprecation warning below, then behave as if TRUE.
+# * `as_vector = TRUE`: convert `out` with as.vector() before returning it.
+# * `as_vector = FALSE`: return `out` unchanged.
+handle_pull_as_vector <- function(out, as_vector) {
+ if (is.null(as_vector)) {
+ warn(
+ c(
+ paste(
+ "Default behavior of `pull()` on Arrow data is changing. Current",
+ "behavior of returning an R vector is deprecated, and in a future",
+ "release, it will return an Arrow `ChunkedArray`. To control this:"
+ ),
+ i = paste(
+ "Specify `as_vector = TRUE` (the current default) or",
+ "`FALSE` (what it will change to) in `pull()`"
+ ),
+ i = "Or, set `options(arrow.pull_as_vector)` globally"
+ ),
+ # `.frequency` / `.frequency_id` ask rlang to rate-limit this warning
+ # under a single id rather than repeating it on every call
+ .frequency = "regularly",
+ .frequency_id = "arrow.pull_as_vector",
+ class = "lifecycle_warning_deprecated"
+ )
+ # Keep the current (deprecated) default: return an R vector
+ as_vector <- TRUE
+ }
+ if (as_vector) {
+ out <- as.vector(out)
+ }
+ out
}
restore_dplyr_features <- function(df, query) {
@@ -115,6 +150,12 @@ implicit_schema <- function(.data) {
# want to go one level up (where we may have called implicit_schema() before)
.data <- ensure_group_vars(.data)
old_schm <- .data$.data$schema
+
+ if (is.null(.data$aggregations) && is.null(.data$join) && !needs_projection(.data$selected_columns, old_schm)) {
+ # Just use the schema we have
+ return(old_schm)
+ }
+
# Add in any augmented fields that may exist in the query but not in the
# real data, in case we have FieldRefs to them
old_schm[["__filename"]] <- string()
diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R
index a8fb7c43300..15618d01d9e 100644
--- a/r/R/dplyr-eval.R
+++ b/r/R/dplyr-eval.R
@@ -95,8 +95,9 @@ arrow_mask <- function(.data, aggregation = FALSE) {
}
}
+ schema <- .data$.data$schema
# Assign the schema to the expressions
- map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+ walk(.data$selected_columns, ~ (.$schema <- schema))
# Add the column references and make the mask
out <- new_data_mask(
diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R
index e1aaa2e12fd..b8337e3069f 100644
--- a/r/R/dplyr-funcs-doc.R
+++ b/r/R/dplyr-funcs-doc.R
@@ -54,7 +54,7 @@
#' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored
#' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored
#' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported
-#' * [`pull()`][dplyr::pull()]: returns an Arrow [ChunkedArray], not an R vector
+#' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally.
#' * [`relocate()`][dplyr::relocate()]
#' * [`rename()`][dplyr::rename()]
#' * [`rename_with()`][dplyr::rename_with()]
@@ -83,7 +83,7 @@
#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both
#' `str_sub()` and `stringr::str_sub()` work.
#'
-#' In addition to these functions, you can call any of Arrow's 244 compute
+#' In addition to these functions, you can call any of Arrow's 243 compute
#' functions directly. Arrow has many functions that don't map to an existing R
#' function. In other cases where there is an R function mapping, you can still
#' call the Arrow function directly if you don't want the adaptations that the R
diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R
index 296133daeed..46720007dd0 100644
--- a/r/R/dplyr-funcs-type.R
+++ b/r/R/dplyr-funcs-type.R
@@ -32,9 +32,9 @@ register_bindings_type <- function() {
#' @param to [DataType] to cast to; for [Table] and [RecordBatch],
#' it should be a [Schema].
#' @param safe logical: only allow the type conversion if no data is lost
-#' (truncation, overflow, etc.). Default is `TRUE`
+#' (truncation, overflow, etc.). Default is `TRUE`.
#' @param ... specific `CastOptions` to set
-#' @return an `Expression`
+#' @return An [Expression]
#'
#' @examples
#' \dontrun{
@@ -43,8 +43,9 @@ register_bindings_type <- function() {
#' mutate(cyl = cast(cyl, string()))
#' }
#' @keywords internal
-#' @seealso https://arrow.apache.org/docs/cpp/api/compute.html for the list of
-#' supported CastOptions.
+#' @seealso [`data-type`] for a list of [DataType] to be used with `to`.
+#' @seealso [Arrow C++ CastOptions documentation](https://arrow.apache.org/docs/cpp/api/compute.html?highlight=castoptions#arrow%3A%3Acompute%3A%3ACastOptions) # nolint
+#' for the list of supported CastOptions.
cast <- function(x, to, safe = TRUE, ...) {
x$cast(to, safe = safe, ...)
}
diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R
index e5f76570616..ee64a09918d 100644
--- a/r/R/dplyr-funcs.R
+++ b/r/R/dplyr-funcs.R
@@ -75,7 +75,7 @@ register_binding <- function(fun_name,
previous_fun <- registry[[unqualified_name]]
# if the unqualified name exists in the registry, warn
- if (!is.null(previous_fun)) {
+ if (!is.null(previous_fun) && !identical(fun, previous_fun)) {
warn(
paste0(
"A \"",
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 57cf417c9ad..85825b9bf2b 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -25,7 +25,10 @@ group_by.arrow_dplyr_query <- function(.data,
.drop = dplyr::group_by_drop_default(.data)) {
if (!missing(add)) {
.Deprecated(
- msg = paste("The `add` argument of `group_by()` is deprecated. Please use the `.add` argument instead.")
+ msg = paste(
+ "The `add` argument of `group_by()` is deprecated.",
+ "Please use the `.add` argument instead."
+ )
)
.add <- add
}
diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R
index 3a9d82f9752..9b6d07d375e 100644
--- a/r/R/dplyr-select.R
+++ b/r/R/dplyr-select.R
@@ -45,7 +45,8 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL
.data <- as_adq(.data)
# Assign the schema to the expressions
- map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+ schema <- .data$.data$schema
+ walk(.data$selected_columns, ~ (.$schema <- schema))
# Create a mask for evaluating expressions in tidyselect helpers
mask <- new_environment(.cache$functions, parent = caller_env())
diff --git a/r/R/schema.R b/r/R/schema.R
index c7e26652c90..93e826eff28 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -182,9 +182,9 @@ Schema$create <- function(...) {
}
if (all(map_lgl(.list, ~ inherits(., "Field")))) {
- schema_(.list)
+ Schema__from_fields(.list)
} else {
- schema_(.fields(.list))
+ Schema__from_list(imap(.list, as_type))
}
}
#' @include arrowExports.R
@@ -298,7 +298,7 @@ length.Schema <- function(x) x$num_fields
call. = FALSE
)
}
- schema_(fields)
+ Schema__from_fields(fields)
}
#' @export
diff --git a/r/R/type.R b/r/R/type.R
index 5089789f6c1..cda606e3fa9 100644
--- a/r/R/type.R
+++ b/r/R/type.R
@@ -24,10 +24,13 @@
#'
#' @section Methods:
#'
-#' TODO
+#' - `$ToString()`: String representation of the DataType
+#' - `$Equals(other)`: Is the DataType equal to `other`
+#' - `$fields()`: The children fields associated with this type
#'
#' @rdname DataType
#' @name DataType
+#' @seealso [`data-type`]
DataType <- R6Class("DataType",
inherit = ArrowObject,
public = list(
diff --git a/r/README.md b/r/README.md
index 2a85a82aeb3..edfa4678f3a 100644
--- a/r/README.md
+++ b/r/README.md
@@ -29,8 +29,8 @@ access to the Arrow C++ library API and higher-level access through a
efficiency** (`read_csv_arrow()`, `read_json_arrow()`)
- Write CSV files (`write_csv_arrow()`)
- Manipulate and analyze Arrow data with **`dplyr` verbs**
-- Read and write files in **Amazon S3** buckets with no additional
- function calls
+- Read and write files in **Amazon S3** and **Google Cloud Storage**
+ buckets with no additional function calls
- Exercise **fine control over column types** for seamless
interoperability with databases and data warehouse systems
- Use **compression codecs** including Snappy, gzip, Brotli,
@@ -64,9 +64,18 @@ additional system dependencies. For macOS and Windows, CRAN hosts binary
packages that contain the Arrow C++ library. On Linux, source package
installation will also build necessary C++ dependencies. For a faster,
more complete installation, set the environment variable
-`NOT_CRAN=true`. See `vignette("install", package = "arrow")` for
-details. Note that version 9.0.0 was the last version to support
-R 3.6 and lower on Windows.
+`NOT_CRAN=true`. See `vignette("install", package = "arrow")` for details.
+
+As of version 10.0.0, `arrow` requires C++17 to build. This means that:
+
+* On Windows, you need `R >= 4.0`. Version 9.0.0 was the last version to support
+R 3.6.
+* On CentOS 7, you can build the latest version of `arrow`,
+but you first need to install a newer compiler than the default system compiler,
+gcc 4.8. See `vignette("install", package = "arrow")` for guidance.
+Note that you only need the newer compiler to build `arrow`:
+installing a binary package, as from RStudio Package Manager,
+or loading a package you've already installed works fine with the system defaults.
### Installing a development version
@@ -134,7 +143,7 @@ returns an R `data.frame`. To return an Arrow `Table`, set argument
- `read_json_arrow()`: read a JSON data file
For writing data to single files, the `arrow` package provides the
-functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`.
+functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`.
These can be used with R `data.frame` and Arrow `Table` objects.
For example, let’s write the Star Wars characters data that’s included
@@ -266,7 +275,7 @@ sw %>%
```
Additionally, equality joins (e.g. `left_join()`, `inner_join()`) are supported
-for joining multiple tables.
+for joining multiple tables.
```r
jedi <- data.frame(
diff --git a/r/configure b/r/configure
index 95328fd080f..eae33be57a4 100755
--- a/r/configure
+++ b/r/configure
@@ -51,6 +51,14 @@ if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then
${R_HOME}/bin/Rscript data-raw/codegen.R
fi
+# arrow needs a C++17 compiler to build. An empty result from
+# `R CMD config CXX17` is treated as "no C++17 compiler available": print a
+# note pointing to the installation docs and abort configure with status 1.
+if [ ! "`${R_HOME}/bin/R CMD config CXX17`" ]; then
+ echo "------------------------- NOTE ---------------------------"
+ echo "Cannot install arrow: a C++17 compiler is required."
+ echo "See https://arrow.apache.org/docs/r/articles/install.html"
+ echo "---------------------------------------------------------"
+ exit 1
+fi
+
if [ -f "tools/apache-arrow.rb" ]; then
# If you want to use a local apache-arrow.rb formula, do
# $ cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb tools/apache-arrow.rb
@@ -177,7 +185,7 @@ else
# Assume nixlibs.R has handled and messaged about its failure already
#
# TODO: what about non-bundled deps?
- # Set CDPATH locally to prevent interference from global CDPATH (if set)
+ # Set CDPATH locally to prevent interference from global CDPATH (if set)
BUNDLED_LIBS=`CDPATH=''; cd $LIB_DIR && ls *.a`
BUNDLED_LIBS=`echo "$BUNDLED_LIBS" | sed -e "s/\\.a lib/ -l/g" | sed -e "s/\\.a$//" | sed -e "s/^lib/-l/" | tr '\n' ' ' | sed -e "s/ $//"`
PKG_DIRS="-L`pwd`/$LIB_DIR"
diff --git a/r/man/DataType.Rd b/r/man/DataType.Rd
index 8c96141bede..7c0bb4ec97c 100644
--- a/r/man/DataType.Rd
+++ b/r/man/DataType.Rd
@@ -9,7 +9,13 @@ class arrow::DataType
}
\section{Methods}{
-
-TODO
+\itemize{
+\item \verb{$ToString()}: String representation of the DataType
+\item \verb{$Equals(other)}: Is the DataType equal to \code{other}
+\item \verb{$fields()}: The children fields associated with this type
+}
}
+\seealso{
+\code{\link{data-type}}
+}
diff --git a/r/man/acero.Rd b/r/man/acero.Rd
index 45afebd336b..84adf081de3 100644
--- a/r/man/acero.Rd
+++ b/r/man/acero.Rd
@@ -38,7 +38,7 @@ Table into an R \code{data.frame}.
\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} and \code{na_matches} arguments are ignored
\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} and \code{na_matches} arguments are ignored
\item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported
-\item \code{\link[dplyr:pull]{pull()}}: returns an Arrow \link{ChunkedArray}, not an R vector
+\item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally.
\item \code{\link[dplyr:relocate]{relocate()}}
\item \code{\link[dplyr:rename]{rename()}}
\item \code{\link[dplyr:rename]{rename_with()}}
@@ -68,7 +68,7 @@ can assume that the function works in Acero just as it does in R.
Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both
\code{str_sub()} and \code{stringr::str_sub()} work.
-In addition to these functions, you can call any of Arrow's 244 compute
+In addition to these functions, you can call any of Arrow's 243 compute
functions directly. Arrow has many functions that don't map to an existing R
function. In other cases where there is an R function mapping, you can still
call the Arrow function directly if you don't want the adaptations that the R
diff --git a/r/man/cast.Rd b/r/man/cast.Rd
index 88134f2e022..81e729c704f 100644
--- a/r/man/cast.Rd
+++ b/r/man/cast.Rd
@@ -13,12 +13,12 @@ cast(x, to, safe = TRUE, ...)
it should be a \link{Schema}.}
\item{safe}{logical: only allow the type conversion if no data is lost
-(truncation, overflow, etc.). Default is \code{TRUE}}
+(truncation, overflow, etc.). Default is \code{TRUE}.}
\item{...}{specific \code{CastOptions} to set}
}
\value{
-an \code{Expression}
+An \link{Expression}
}
\description{
This is a wrapper around the \verb{$cast()} method that many Arrow objects have.
@@ -32,7 +32,9 @@ mtcars \%>\%
}
}
\seealso{
-https://arrow.apache.org/docs/cpp/api/compute.html for the list of
-supported CastOptions.
+\code{\link{data-type}} for a list of \link{DataType} to be used with \code{to}.
+
+\href{https://arrow.apache.org/docs/cpp/api/compute.html?highlight=castoptions#arrow\%3A\%3Acompute\%3A\%3ACastOptions}{Arrow C++ CastOptions documentation} # nolint
+for the list of supported CastOptions.
}
\keyword{internal}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 91c3c6a2356..cde8795c9fb 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -4776,11 +4776,19 @@ BEGIN_CPP11
END_CPP11
}
// schema.cpp
-std::shared_ptr schema_(const std::vector>& fields);
-extern "C" SEXP _arrow_schema_(SEXP fields_sexp){
+std::shared_ptr Schema__from_fields(const std::vector>& fields);
+extern "C" SEXP _arrow_Schema__from_fields(SEXP fields_sexp){
BEGIN_CPP11
arrow::r::Input>&>::type fields(fields_sexp);
- return cpp11::as_sexp(schema_(fields));
+ return cpp11::as_sexp(Schema__from_fields(fields));
+END_CPP11
+}
+// schema.cpp
+std::shared_ptr Schema__from_list(cpp11::list field_list);
+extern "C" SEXP _arrow_Schema__from_list(SEXP field_list_sexp){
+BEGIN_CPP11
+ arrow::r::Input::type field_list(field_list_sexp);
+ return cpp11::as_sexp(Schema__from_list(field_list));
END_CPP11
}
// schema.cpp
@@ -5695,7 +5703,8 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1},
{ "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2},
{ "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2},
- { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1},
+ { "_arrow_Schema__from_fields", (DL_FUNC) &_arrow_Schema__from_fields, 1},
+ { "_arrow_Schema__from_list", (DL_FUNC) &_arrow_Schema__from_list, 1},
{ "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1},
{ "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1},
{ "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2},
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 1ed949e7295..0bfc5172852 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -611,8 +611,8 @@ class RScalarUDFKernelState : public arrow::compute::KernelState {
RScalarUDFKernelState(cpp11::sexp exec_func, cpp11::sexp resolver)
: exec_func_(exec_func), resolver_(resolver) {}
- cpp11::function exec_func_;
- cpp11::function resolver_;
+ cpp11::sexp exec_func_;
+ cpp11::sexp resolver_;
};
arrow::Result ResolveScalarUDFOutputType(
@@ -630,7 +630,8 @@ arrow::Result ResolveScalarUDFOutputType(
cpp11::to_r6(input_types[i].GetSharedPtr());
}
- cpp11::sexp output_type_sexp = state->resolver_(input_types_sexp);
+ cpp11::sexp output_type_sexp =
+ cpp11::function(state->resolver_)(input_types_sexp);
if (!Rf_inherits(output_type_sexp, "DataType")) {
cpp11::stop(
"Function specified as arrow_scalar_function() out_type argument must "
@@ -674,7 +675,8 @@ arrow::Status CallRScalarUDF(arrow::compute::KernelContext* context,
cpp11::writable::list udf_context = {batch_length_sexp, output_type_sexp};
udf_context.names() = {"batch_length", "output_type"};
- cpp11::sexp func_result_sexp = state->exec_func_(udf_context, args_sexp);
+ cpp11::sexp func_result_sexp =
+ cpp11::function(state->exec_func_)(udf_context, args_sexp);
if (Rf_inherits(func_result_sexp, "Array")) {
auto array = cpp11::as_cpp>(func_result_sexp);
diff --git a/r/src/recordbatchreader.cpp b/r/src/recordbatchreader.cpp
index d0c52acc416..8e9df121748 100644
--- a/r/src/recordbatchreader.cpp
+++ b/r/src/recordbatchreader.cpp
@@ -70,7 +70,7 @@ class RFunctionRecordBatchReader : public arrow::RecordBatchReader {
arrow::Status ReadNext(std::shared_ptr* batch_out) {
auto batch = SafeCallIntoR>([&]() {
- cpp11::sexp result_sexp = fun_();
+ cpp11::sexp result_sexp = cpp11::function(fun_)();
if (result_sexp == R_NilValue) {
return std::shared_ptr(nullptr);
} else if (!Rf_inherits(result_sexp, "RecordBatch")) {
@@ -94,7 +94,7 @@ class RFunctionRecordBatchReader : public arrow::RecordBatchReader {
}
private:
- cpp11::function fun_;
+ cpp11::sexp fun_;
std::shared_ptr schema_;
};
diff --git a/r/src/schema.cpp b/r/src/schema.cpp
index 2bc58f0fa36..0dac188ec07 100644
--- a/r/src/schema.cpp
+++ b/r/src/schema.cpp
@@ -22,11 +22,28 @@
#include
// [[arrow::export]]
-std::shared_ptr<arrow::Schema> schema_(
+std::shared_ptr<arrow::Schema> Schema__from_fields(
const std::vector<std::shared_ptr<arrow::Field>>& fields) {
return arrow::schema(fields);
}
+// Build a Schema from a named R list of DataType objects. Each list element
+// becomes one field, named after the corresponding entry of the list's
+// `names` attribute and always marked nullable.
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Schema__from_list(cpp11::list field_list) {
+  // cpp11 reports lengths as R_xlen_t; keep that type to avoid narrowing.
+  R_xlen_t n = field_list.size();
+
+  bool nullable = true;
+  cpp11::strings names(field_list.attr(R_NamesSymbol));
+
+  std::vector<std::shared_ptr<arrow::Field>> fields(n);
+
+  for (R_xlen_t i = 0; i < n; i++) {
+    fields[i] = arrow::field(
+        names[i], cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(field_list[i]),
+        nullable);
+  }
+  return arrow::schema(fields);
+}
+
// [[arrow::export]]
std::string Schema__ToString(const std::shared_ptr& s) {
return s->ToString();
diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R
index d705a8029c5..6812a3eec0a 100644
--- a/r/tests/testthat/helper-arrow.R
+++ b/r/tests/testthat/helper-arrow.R
@@ -29,6 +29,10 @@ Sys.setlocale("LC_COLLATE", "C")
# (R CMD check does this, but in case you're running outside of check)
Sys.setenv(LANGUAGE = "en")
+# Set this option so that the deprecation warning isn't shown
+# (except when we test for it)
+options(arrow.pull_as_vector = FALSE)
+
with_language <- function(lang, expr) {
old <- Sys.getenv("LANGUAGE")
# Check what this message is before changing languages; this will
diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R
index 86f984dd32c..48b74c9af43 100644
--- a/r/tests/testthat/test-dplyr-funcs.R
+++ b/r/tests/testthat/test-dplyr-funcs.R
@@ -35,6 +35,9 @@ test_that("register_binding()/unregister_binding() works", {
register_binding("some.pkg2::some_fun", fun2, fake_registry),
"A \"some_fun\" binding already exists in the registry and will be overwritten."
)
+
+ # No warning when an identical function is re-registered
+ expect_silent(register_binding("some.pkg2::some_fun", fun2, fake_registry))
})
test_that("register_binding_agg() works", {
diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R
index db9a3bb30d0..ef9a9bcdc14 100644
--- a/r/tests/testthat/test-dplyr-query.R
+++ b/r/tests/testthat/test-dplyr-query.R
@@ -91,6 +91,17 @@ test_that("pull", {
)
})
+test_that("pull() shows a deprecation warning if the option isn't set", {
+ # The test helpers set options(arrow.pull_as_vector = FALSE), so pass
+ # as_vector = NULL explicitly to exercise the "option unset" code path
+ expect_warning(
+ vec <- tbl %>%
+ arrow_table() %>%
+ pull(as_vector = NULL),
+ "Current behavior of returning an R vector is deprecated"
+ )
+ # And the default is the old behavior, an R vector
+ expect_identical(vec, pull(tbl))
+})
+
test_that("collect(as_data_frame=FALSE)", {
batch <- record_batch(tbl)
@@ -583,9 +594,9 @@ test_that("needs_projection unit tests", {
test_that("compute() on a grouped query returns a Table with groups in metadata", {
tab1 <- tbl %>%
- arrow_table() %>%
- group_by(int) %>%
- compute()
+ arrow_table() %>%
+ group_by(int) %>%
+ compute()
expect_r6_class(tab1, "Table")
expect_equal(
as.data.frame(tab1),
diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R
index 025cf059f8a..817563b85cc 100644
--- a/r/tools/nixlibs.R
+++ b/r/tools/nixlibs.R
@@ -32,7 +32,7 @@ dev_version <- package_version(VERSION)[1, 4]
# Small dev versions are added for R-only changes during CRAN submission.
if (is.na(dev_version) || dev_version < 100) {
VERSION <- package_version(VERSION)[1, 1:3]
- arrow_repo <- sprintf("https://apache.jfrog.io/artifactory/arrow/r/%s/libarrow/", VERSION)
+ arrow_repo <- paste0(getOption("arrow.repo", sprintf("https://apache.jfrog.io/artifactory/arrow/r/%s", VERSION)), "/libarrow/")
} else {
arrow_repo <- paste0(getOption("arrow.dev_repo", "https://nightlies.apache.org/arrow/r"), "/libarrow/")
}
@@ -98,8 +98,8 @@ download_binary <- function(lib) {
# * Some other string: a "distro-version" that corresponds to a binary that is
# available, to override what this function may discover by default.
# Possible values are:
-# * "centos-7" (gcc 4.8, no AWS/GCS support)
-# * "ubuntu-18.04" (gcc 8, openssl 1)
+# * "centos-7" (gcc 8 (devtoolset), openssl 1, glibc 2.17)
+# * "ubuntu-18.04" (gcc 8, openssl 1, glibc 2.27)
# * "ubuntu-22.04" (openssl 3)
# These string values, along with `NULL`, are the potential return values of
# this function.
@@ -137,28 +137,21 @@ check_allowlist <- function(os, allowed = "https://raw.githubusercontent.com/apa
select_binary <- function(os = tolower(Sys.info()[["sysname"]]),
arch = tolower(Sys.info()[["machine"]]),
- compiler_version = compiler_version_string(),
test_program = test_for_curl_and_openssl) {
if (identical(os, "linux") && identical(arch, "x86_64")) {
# We only host x86 linux binaries today
- is_gcc4 <- any(grepl("^g\\+\\+.*[^\\d.]4(\\.\\d){2}", compiler_version))
- if (is_gcc4) {
- cat("*** Some features are not available with gcc 4\n")
- return("centos-7")
- } else {
- tryCatch(
- # Somehow the test program system2 call errors on the sanitizer builds
- # so globally handle the possibility that this could fail
- {
- errs <- compile_test_program(test_program)
- determine_binary_from_stderr(errs)
- },
- error = function(e) {
- cat("*** Unable to find libcurl and openssl\n")
- NULL
- }
- )
- }
+ tryCatch(
+ # Somehow the test program system2 call errors on the sanitizer builds
+ # so globally handle the possibility that this could fail
+ {
+ errs <- compile_test_program(test_program)
+ determine_binary_from_stderr(errs)
+ },
+ error = function(e) {
+ cat("*** Unable to find libcurl and openssl\n")
+ NULL
+ }
+ )
} else {
# No binary available for arch
cat(sprintf("*** Building on %s %s\n", os, arch))
@@ -196,30 +189,20 @@ compile_test_program <- function(code) {
suppressWarnings(system2("echo", sprintf('"%s" | %s -', code, runner), stdout = FALSE, stderr = TRUE))
}
-# TODO(ARROW-16976): build "ubuntu-18.04" on centos7 with newer devtoolset (but glibc is 2.17) for broader compatibility (like manylinux2014)?
+# TODO(ARROW-16976): drop "ubuntu-18.04" and just use "centos-7"
+# (built with newer devtoolset but older glibc (2.17) for broader compatibility, like manylinux2014)
determine_binary_from_stderr <- function(errs) {
if (is.null(attr(errs, "status"))) {
# There was no error in compiling: so we found libcurl and openssl > 1.0.2,
# openssl is < 3.0, glibc is >= 2.27, and we're not using a strict libc++
cat("*** Found libcurl and openssl >= 1.0.2\n")
return("ubuntu-18.04")
+ # Else, check for dealbreakers:
} else if (any(grepl("Using libc++", errs, fixed = TRUE))) {
# Our binaries are all built with GNU stdlib so they fail with libc++
cat("*** Found libc++\n")
return(NULL)
- } else if (any(grepl("glibc version too old", errs))) {
- # ubuntu-18.04 has glibc 2.27, so even if you install newer compilers
- # (e.g. devtoolset on centos) and have curl/openssl, you run into problems
- # TODO(ARROW-16976): build binaries with older glibc
- cat("*** Checking glibc version\n")
- # If we're here, we're on an older OS but with a newer compiler than gcc 4.8
- # (we already checked), so it is possible to build with more features on.
- # We just can't use our binaries because they were built with newer glibc.
- return("centos-7")
} else if (header_not_found("curl/curl", errs)) {
- # TODO(ARROW-16985): should these next 3 NULL cases return centos-7? A source build
- # won't be able to include more features.
- # Could check if build_ok (also for glibc?)
cat("*** libcurl not found\n")
return(NULL)
} else if (header_not_found("openssl/opensslv", errs)) {
@@ -228,6 +211,15 @@ determine_binary_from_stderr <- function(errs) {
} else if (any(grepl("OpenSSL version too old", errs))) {
cat("*** openssl found but version >= 1.0.2 is required for some features\n")
return(NULL)
+ # Else, determine which other binary will work
+ } else if (any(grepl("glibc version too old", errs))) {
+ # ubuntu-18.04 has glibc 2.27, so even if you install newer compilers
+ # (e.g. devtoolset on centos) and have curl/openssl, you run into problems
+ # TODO(ARROW-16976): build binaries with older glibc
+ cat("*** Checking glibc version\n")
+ # If we're here, we're on an older OS but with a new enough compiler
+ # (e.g. CentOS 7 with devtoolset-8)
+ return("centos-7")
} else if (any(grepl("Using OpenSSL version 3", errs))) {
cat("*** Found libcurl and openssl >= 3.0.0\n")
return("ubuntu-22.04")
@@ -240,10 +232,6 @@ header_not_found <- function(header, errs) {
any(grepl(regex, errs))
}
-compiler_version_string <- function(compiler = R_CMD_config("CXX17")) {
- system(paste(compiler, "--version"), intern = TRUE)
-}
-
#### start distro ####
distro <- function() {
@@ -436,10 +424,9 @@ build_libarrow <- function(src_dir, dst_dir) {
LDFLAGS = R_CMD_config("LDFLAGS")
)
env_var_list <- with_cloud_support(env_var_list)
- env_var_list <- with_mimalloc(env_var_list)
- # turn_off_all_optional_features() needs to happen after with_mimalloc() and
- # with_cloud_support(), since those might turn features ON.
+ # turn_off_all_optional_features() needs to happen after
+ # with_cloud_support(), since it might turn features ON.
thirdparty_deps_unavailable <- !download_ok &&
!dir.exists(thirdparty_dependency_dir) &&
!env_is("ARROW_DEPENDENCY_SOURCE", "system")
@@ -654,26 +641,12 @@ is_feature_requested <- function(env_varname, default = env_is("LIBARROW_MINIMAL
requested
}
-with_mimalloc <- function(env_var_list) {
- arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC")
- if (arrow_mimalloc) {
- # User wants mimalloc. If they're using gcc, let's make sure the version is >= 4.9
- if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) {
- cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n")
- arrow_mimalloc <- FALSE
- }
- }
- replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF"))
-}
-
with_cloud_support <- function(env_var_list) {
arrow_s3 <- is_feature_requested("ARROW_S3")
arrow_gcs <- is_feature_requested("ARROW_GCS")
if (arrow_s3 || arrow_gcs) {
# User wants S3 or GCS support.
- # If they're using gcc, let's make sure the version is >= 4.9
- # (aws-sdk-cpp requires that; google-cloud-cpp only tests with >= 6.3)
- # and make sure that we have curl and openssl system libs
+ # Make sure that we have curl and openssl system libs
feats <- c(
if (arrow_s3) "S3",
if (arrow_gcs) "GCS"
@@ -690,11 +663,7 @@ with_cloud_support <- function(env_var_list) {
# capabilities for using binaries. We could consider consolidating this
# logic, though these use cmake in order to match exactly what we do in the
# libarrow build, and maybe that increases the fidelity.
- if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) {
- print_warning("not available for gcc < 4.9")
- arrow_s3 <- FALSE
- arrow_gcs <- FALSE
- } else if (!cmake_find_package("CURL", NULL, env_var_list)) {
+ if (!cmake_find_package("CURL", NULL, env_var_list)) {
# curl on macos should be installed, so no need to alter this for macos
# TODO: check for apt/yum/etc. and message the right thing?
print_warning("requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb)")
@@ -712,25 +681,6 @@ with_cloud_support <- function(env_var_list) {
replace(env_var_list, "ARROW_GCS", ifelse(arrow_gcs, "ON", "OFF"))
}
-cmake_gcc_version <- function(env_var_list) {
- # This function returns NA if using a non-gcc compiler
- # Always enclose calls to it in isTRUE() or isFALSE()
- vals <- cmake_cxx_compiler_vars(env_var_list)
- if (!identical(vals[["CMAKE_CXX_COMPILER_ID"]], "GNU")) {
- return(NA)
- }
- package_version(vals[["CMAKE_CXX_COMPILER_VERSION"]])
-}
-
-cmake_cxx_compiler_vars <- function(env_var_list) {
- env_vars <- env_vars_as_string(env_var_list)
- info <- system(paste("export", env_vars, "&& $CMAKE --system-information"), intern = TRUE)
- info <- grep("^[A-Z_]* .*$", info, value = TRUE)
- vals <- as.list(sub('^.*? "?(.*?)"?$', "\\1", info))
- names(vals) <- sub("^(.*?) .*$", "\\1", info)
- vals[grepl("^CMAKE_CXX_COMPILER_?", names(vals))]
-}
-
cmake_find_package <- function(pkg, version = NULL, env_var_list) {
td <- tempfile()
dir.create(td)
diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R
index d5e83b13058..631ff7a3e35 100644
--- a/r/tools/test-nixlibs.R
+++ b/r/tools/test-nixlibs.R
@@ -39,19 +39,6 @@ test_that("select_binary() based on system", {
expect_null(select_binary("linux", arch = "aarch64")), # Not built today
"Building on linux aarch64"
)
- gcc48 <- c(
- "g++-4.8 (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4",
- "Copyright (C) 2013 Free Software Foundation, Inc.",
- "This is free software; see the source for copying conditions. There is NO",
- "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
- )
- expect_output(
- expect_identical(
- select_binary("linux", "x86_64", compiler_version = gcc48),
- "centos-7"
- ),
- "Some features are not available with gcc 4"
- )
})
test_that("compile_test_program()", {
@@ -87,14 +74,14 @@ test_that("determine_binary_from_stderr", {
test_that("select_binary() with test program", {
expect_output(
expect_identical(
- select_binary("linux", "x86_64", "clang", "int a;"),
+ select_binary("linux", "x86_64", "int a;"),
"ubuntu-18.04"
),
"Found libcurl and openssl >= 1.0.2"
)
expect_output(
expect_identical(
- select_binary("linux", "x86_64", "clang", "#error Using OpenSSL version 3"),
+ select_binary("linux", "x86_64", "#error Using OpenSSL version 3"),
"ubuntu-22.04"
),
"Found libcurl and openssl >= 3.0.0"
diff --git a/r/tools/winlibs.R b/r/tools/winlibs.R
index 165c98da5ea..d941da4baa6 100644
--- a/r/tools/winlibs.R
+++ b/r/tools/winlibs.R
@@ -44,7 +44,10 @@ if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) {
"/libarrow/bin/windows/arrow-%s.zip"
)
# %1$s uses the first variable for both substitutions
- artifactory <- "https://apache.jfrog.io/artifactory/arrow/r/%1$s/libarrow/bin/windows/arrow-%1$s.zip"
+ artifactory <- paste0(
+ getOption("arrow.repo", "https://apache.jfrog.io/artifactory/arrow/r/%1$s"),
+ "/libarrow/bin/windows/arrow-%1$s.zip"
+ )
rwinlib <- "https://github.com/rwinlib/arrow/archive/v%s.zip"
dev_version <- package_version(VERSION)[1, 4]
diff --git a/r/vignettes/developers/setup.Rmd b/r/vignettes/developers/setup.Rmd
index 54575d14cf8..49b52ed8288 100644
--- a/r/vignettes/developers/setup.Rmd
+++ b/r/vignettes/developers/setup.Rmd
@@ -47,9 +47,9 @@ recent version of the library without building from source.
On Linux, you can download a .zip file containing libarrow from the
[nightly repository](https://nightlies.apache.org/arrow/r/libarrow/bin/).
-The directory names correspond to the OS the binaries where built on:
-- "centos-7" (gcc 4.8, no AWS/GCS support)
-- "ubuntu-18.04" (gcc 8, openssl 1)
+The directory names correspond to the OS the binaries were built on:
+- "centos-7" (gcc 8 via devtoolset, openssl 1, glibc 2.17)
+- "ubuntu-18.04" (gcc 8, openssl 1, glibc 2.27)
- "ubuntu-22.04" (openssl 3)
Version numbers in that repository correspond to dates.
@@ -68,7 +68,7 @@ brew install apache-arrow --HEAD
### Windows
-On Windows, you can download a .zip file containing libarrow from the
+On Windows, you can download a .zip file containing libarrow from the
[nightly repository](https://nightlies.apache.org/arrow/r/libarrow/bin/windows/).
Version numbers in that repository correspond to dates.
@@ -462,4 +462,4 @@ guide](https://arrow.apache.org/docs/developers/cpp/building.html).
## Other installation issues
-There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information.
+There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the article on R package installation](./install_details.html) for more information.
diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd
index 36c973289b2..953f3c41bfc 100644
--- a/r/vignettes/install.Rmd
+++ b/r/vignettes/install.Rmd
@@ -25,6 +25,57 @@ a more detailed discussion of the code run during the installation process in th
> Having trouble installing arrow? See the "Troubleshooting" section below.
+# System dependencies
+
+The arrow package is designed to work with very minimal system requirements,
+but there are a few things to note.
+
+## Compilers
+
+As of version 10.0.0, arrow requires a C++17 compiler to build.
+For `gcc`, this generally means version 7 or newer. Most contemporary Linux
+distributions have a new enough compiler; however, CentOS 7 is a notable
+exception, as it ships with gcc 4.8.
+
+If you are on CentOS 7, to build arrow you will need to install a newer `devtoolset`, and you'll need to update R's Makevars to define the `CXX17` variables. This script installs `devtoolset-8` and configures R to be able to use C++17:
+
+```
+#!/usr/bin/env bash
+
+yum install -y centos-release-scl
+yum install -y devtoolset-8
+# Optional: also install cloud storage dependencies, as described below
+yum install -y libcurl-devel openssl-devel
+
+source /opt/rh/devtoolset-8/enable
+
+if [ ! `R CMD config CXX17` ]; then
+ mkdir -p ~/.R
+ echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars
+ echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars
+ echo "CXX17STD = -std=c++17" >> ~/.R/Makevars
+ echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars
+fi
+```
+
+Note that the C++17 compiler is only required at *build* time. You don't need
+to enable the devtoolset every time you load the package. What's more, if you
+install a binary package from RStudio Package Manager (see method 1a below), you
+do not need to set up any of this. Likewise, if you `R CMD INSTALL --build`
+arrow on a CentOS machine with the newer compilers, you can take the binary
+package it produces and install it on any other CentOS machine without those
+compilers.
+
+## Libraries
+
+Optional support for reading from cloud storage--AWS S3 and
+Google Cloud Storage (GCS)--requires additional system dependencies:
+
+* CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb)
+* OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb)
+
+The prebuilt binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support.
+
# Installing a release version (the easy way)
## Method 1 - Installation with a precompiled libarrow binary
@@ -85,7 +136,12 @@ install.packages("arrow")
This installs the source version of the R package, but during the installation process will check for compatible libarrow binaries that we host and use those if available. If no binary is available or can't be found, then this option falls back onto method 2 below (full source build), but setting the environment variable results in a more fully-featured build than default.
-Except for the those built for gcc 4.8 (default on CentOS 7), the binaries include support for AWS S3 and Google Cloud Storage (GCS). These features require libcurl and openssl libraries installed separately; see below on how to install them. If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build.
+The libarrow binaries include support for AWS S3 and GCS, so they require the
+libcurl and openssl libraries installed separately, as noted above.
+If you don't have these installed, the libarrow binary won't be used, and you will fall back to the full source build (with S3 and GCS support disabled).
+
+Users on CentOS 7 will also need to install and configure a C++17 compiler.
+See "System dependencies" above.
# Installing a release version (the less easy way)
@@ -172,20 +228,17 @@ If downloading dependencies at build time is not an option, as when building on
#### Dependencies for S3 and GCS support
-The arrow package allows you to work with data in AWS S3 or in other cloud
-storage system that emulate S3, as well as Google Cloud Storage.
-However, support for working with S3 and GCS is not
-enabled in the default source build, and it has additional system requirements. To
+Support for working with data in S3 and GCS is not enabled in the default
+source build, and it has additional system requirements as described above. To
enable it, set the environment variable `LIBARROW_MINIMAL=false` or
`NOT_CRAN=true` to choose the full-featured build, or more selectively set
`ARROW_S3=ON` and/or `ARROW_GCS=ON`.
-You also need the following system dependencies:
-
-* `gcc` >= 4.9 or `clang` >= 3.3; note that the default compiler on CentOS 7 is gcc 4.8.5, which is not sufficient
-* CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb)
-* OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb)
-The prebuilt libarrow binaries come with S3 and GCS support enabled, so you will need to meet these system requirements in order to use them. If you're building everything from source, the install script will check for the presence of these dependencies and turn off S3 and GCS support in the build if the prerequisites are not met--installation will succeed but without S3 or GCS functionality. If afterwards you install the missing system requirements, you'll need to reinstall the package in order to enable S3 and GCS support.
+When either feature is enabled, the install script will check for the presence
+of the required dependencies, and if the prerequisites are not met, it will turn
+off S3 and GCS support--installation will succeed but without S3 or GCS
+functionality. If afterwards you install the missing system requirements,
+you'll need to reinstall the package in order to enable S3 and GCS support.
### Advanced configuration for building from source
@@ -239,8 +292,8 @@ See below for more in-depth explanations of these environment variables.
will work with your system. You can set it to `false` to skip this option
altogether, or you can specify a string "distro-version" that corresponds to
a binary that is available, to override what this function may discover by
- default. Possible values are: "centos-7" (gcc 4.8, no AWS/GCS support);
- "ubuntu-18.04" (gcc 8, openssl 1); "ubuntu-22.04" (openssl 3).
+ default. Possible values are: "centos-7",
+ "ubuntu-18.04" (both with gcc 8, and openssl 1), "ubuntu-22.04" (openssl 3).
* `LIBARROW_BUILD` : If set to `false`, the build script
will not attempt to build the C++ from source. This means you will only get
a working arrow R package if a prebuilt binary is found.
@@ -477,19 +530,7 @@ so that we can improve the script.
## Known installation issues
-* On CentOS, if you are using a more modern `devtoolset`, you may need to set
-the environment variables `CC` and `CXX` either in the shell or in R's `Makeconf`.
-For CentOS 7 and above, both the Arrow system packages and the C++ binaries
-for R are built with the default system compilers. If you want to use either of these
-and you have a `devtoolset` installed, set `CC=/usr/bin/gcc CXX=/usr/bin/g++`
-to use the system compilers instead of the `devtoolset`.
-Alternatively, if you want to build arrow with the newer `devtoolset` compilers,
-set both `ARROW_USE_PKG_CONFIG` and `LIBARROW_BINARY` to `false` so that
-you build the Arrow C++ from source using those compilers.
-Compiler mismatch between the arrow system libraries and the R
-package may cause R to segfault when arrow package functions are used.
-See discussions [here](https://issues.apache.org/jira/browse/ARROW-8586)
-and [here](https://issues.apache.org/jira/browse/ARROW-10780).
+* On CentOS 7, building the package requires newer compilers than the system defaults, such as those provided by `devtoolset-8`. See "System dependencies" above.
* If you have multiple versions of `zstd` installed on your system,
installation by building libarrow from source may fail with an "undefined symbols"
diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb
index b882da0a1b5..4670350a09d 100644
--- a/ruby/red-arrow/test/test-orc.rb
+++ b/ruby/red-arrow/test/test-orc.rb
@@ -164,8 +164,8 @@ def pp_values(values)
]
end
assert_equal([
- ["boolean1: bool", [pp_values([false, true])]],
- ["short1: int16", [pp_values([1024, 2048])]],
+ ["byte1: int8", [pp_values([1, 100])]],
+ ["int1: int32", [pp_values([65536, 65536])]],
],
dump)
end