diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 6f8e07ba8..df921ef99 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -348,6 +348,28 @@ if(OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS AND OQS_USE_ARM_NEO
endif()
endif()
+cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_512 "" ON "OQS_ENABLE_SIG_FALCON" OFF)
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_512_avx2 "" ON "OQS_ENABLE_SIG_falcon_padded_512" OFF)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS AND OQS_USE_ARM_NEON_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_512_aarch64 "" ON "OQS_ENABLE_SIG_falcon_padded_512" OFF)
+endif()
+endif()
+
+cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_1024 "" ON "OQS_ENABLE_SIG_FALCON" OFF)
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_1024_avx2 "" ON "OQS_ENABLE_SIG_falcon_padded_1024" OFF)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS AND OQS_USE_ARM_NEON_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_1024_aarch64 "" ON "OQS_ENABLE_SIG_falcon_padded_1024" OFF)
+endif()
+endif()
+
option(OQS_ENABLE_SIG_SPHINCS "Enable sphincs algorithm family" ON)
cmake_dependent_option(OQS_ENABLE_SIG_sphincs_sha2_128f_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
@@ -448,7 +470,7 @@ if(NOT ((OQS_MINIMAL_BUILD STREQUAL "") OR (OQS_MINIMAL_BUILD STREQUAL "OFF")))
filter_algs("${OQS_MINIMAL_BUILD}")
elseif (${OQS_ALGS_ENABLED} STREQUAL "STD")
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_START
- filter_algs("KEM_ml_kem_512_ipd;KEM_ml_kem_512;KEM_ml_kem_768_ipd;KEM_ml_kem_768;KEM_ml_kem_1024_ipd;KEM_ml_kem_1024;SIG_ml_dsa_44_ipd;SIG_ml_dsa_44;SIG_ml_dsa_65_ipd;SIG_ml_dsa_65;SIG_ml_dsa_87_ipd;SIG_ml_dsa_87;SIG_falcon_512;SIG_falcon_1024;SIG_sphincs_sha2_128f_simple;SIG_sphincs_sha2_128s_simple;SIG_sphincs_sha2_192f_simple;SIG_sphincs_sha2_192s_simple;SIG_sphincs_sha2_256f_simple;SIG_sphincs_sha2_256s_simple;SIG_sphincs_shake_128f_simple;SIG_sphincs_shake_128s_simple;SIG_sphincs_shake_192f_simple;SIG_sphincs_shake_192s_simple;SIG_sphincs_shake_256f_simple;SIG_sphincs_shake_256s_simple")
+ filter_algs("KEM_ml_kem_512_ipd;KEM_ml_kem_512;KEM_ml_kem_768_ipd;KEM_ml_kem_768;KEM_ml_kem_1024_ipd;KEM_ml_kem_1024;SIG_ml_dsa_44_ipd;SIG_ml_dsa_44;SIG_ml_dsa_65_ipd;SIG_ml_dsa_65;SIG_ml_dsa_87_ipd;SIG_ml_dsa_87;SIG_falcon_512;SIG_falcon_1024;SIG_falcon_padded_512;SIG_falcon_padded_1024;SIG_sphincs_sha2_128f_simple;SIG_sphincs_sha2_128s_simple;SIG_sphincs_sha2_192f_simple;SIG_sphincs_sha2_192s_simple;SIG_sphincs_sha2_256f_simple;SIG_sphincs_sha2_256s_simple;SIG_sphincs_shake_128f_simple;SIG_sphincs_shake_128s_simple;SIG_sphincs_shake_192f_simple;SIG_sphincs_shake_192s_simple;SIG_sphincs_shake_256f_simple;SIG_sphincs_shake_256s_simple")
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_END
elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_R4")
filter_algs("KEM_classic_mceliece_348864;KEM_classic_mceliece_348864f;KEM_classic_mceliece_460896;KEM_classic_mceliece_460896f;KEM_classic_mceliece_6688128;KEM_classic_mceliece_6688128f;KEM_classic_mceliece_6960119;KEM_classic_mceliece_6960119f;KEM_classic_mceliece_8192128;KEM_classic_mceliece_8192128f;KEM_hqc_128;KEM_hqc_192;KEM_hqc_256;KEM_bike_l1;KEM_bike_l3")
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 493670f80..5c15e2dc3 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -158,7 +158,7 @@ jobs:
# See https://github.com/open-quantum-safe/liboqs/issues/738#issuecomment-621394744
default: --numprocesses=auto
machine:
- image: ubuntu-2004:202101-01
+ image: default # analogous to ubuntu-latest on GH Actions
resource_class: arm.medium
steps:
- checkout
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 28b6f3cca..327e04d16 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -46,12 +46,10 @@ jobs:
container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
CMAKE_ARGS: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=generic
PYTEST_ARGS: --numprocesses=auto -k 'test_kat_all'
- SKIP_ALGS: 'Falcon-1024' # re-enable when #1561 is resolved
- name: extensions
container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
CMAKE_ARGS: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=haswell
PYTEST_ARGS: --numprocesses=auto -k 'test_kat_all'
- SKIP_ALGS: 'Falcon-1024' # re-enable when #1561 is resolved
container:
image: ${{ matrix.container }}
steps:
diff --git a/README.md b/README.md
index f9b49615d..738fa19d5 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ The list below indicates all algorithms supported by liboqs, but not all those a
- **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5
-- **Falcon**: Falcon-512, Falcon-1024
+- **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024
- **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87)
- **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple
- **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple
@@ -185,6 +185,7 @@ liboqs includes some third party libraries or modules that are licensed differen
- `src/kem/ml_kem/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/dilithium/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/dilithium/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT
+- `src/sig/falcon/pqclean_*_aarch64`: Apache License v2.0
- `src/sig/ml_dsa/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/sphincs/pqclean_*`: CC0 (public domain)
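For reviewers who want to exercise the new parameter sets, here is a minimal sketch against liboqs's generic signature API (a sketch only, not part of the patch; the algorithm is looked up by the display name listed in the README section above, and error handling is trimmed):

    #include <oqs/oqs.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        /* "Falcon-padded-512" is the display name added to the list above. */
        OQS_SIG *sig = OQS_SIG_new("Falcon-padded-512");
        if (sig == NULL) {
            fprintf(stderr, "Falcon-padded-512 is not enabled in this build\n");
            return EXIT_FAILURE;
        }
        uint8_t *pk = malloc(sig->length_public_key);
        uint8_t *sk = malloc(sig->length_secret_key);
        uint8_t *sm = malloc(sig->length_signature);  /* fixed length for the padded variants */
        size_t smlen = 0;
        const uint8_t msg[] = "example message";

        /* keygen -> sign -> verify round trip */
        if (OQS_SIG_keypair(sig, pk, sk) != OQS_SUCCESS ||
            OQS_SIG_sign(sig, sm, &smlen, msg, sizeof msg, sk) != OQS_SUCCESS ||
            OQS_SIG_verify(sig, msg, sizeof msg, sm, smlen, pk) != OQS_SUCCESS) {
            fprintf(stderr, "round trip failed\n");
            return EXIT_FAILURE;
        }
        printf("signature length: %zu bytes\n", smlen);  /* 666 for Falcon-padded-512 */

        OQS_MEM_secure_free(sk, sig->length_secret_key);
        free(pk);
        free(sm);
        OQS_SIG_free(sig);
        return EXIT_SUCCESS;
    }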
diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md
index 68840c4b0..2c6a267e4 100644
--- a/docs/algorithms/kem/classic_mceliece.md
+++ b/docs/algorithms/kem/classic_mceliece.md
@@ -6,7 +6,7 @@
- **Authors' website**: https://classic.mceliece.org
- **Specification version**: SUPERCOP-20221025.
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
- **Implementation license (SPDX-Identifier)**: Public domain
- **Ancestors of primary source**:
- SUPERCOP-20221025 "clean" and "avx2" implementations
diff --git a/docs/algorithms/kem/classic_mceliece.yml b/docs/algorithms/kem/classic_mceliece.yml
index 3af5a3e74..99a828bc6 100644
--- a/docs/algorithms/kem/classic_mceliece.yml
+++ b/docs/algorithms/kem/classic_mceliece.yml
@@ -378,4 +378,4 @@ parameter-sets:
auxiliary-submitters: []
primary-upstream:
spdx-license-identifier: Public domain
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
diff --git a/docs/algorithms/kem/hqc.md b/docs/algorithms/kem/hqc.md
index 58d083481..dca44d745 100644
--- a/docs/algorithms/kem/hqc.md
+++ b/docs/algorithms/kem/hqc.md
@@ -6,7 +6,7 @@
- **Authors' website**: https://pqc-hqc.org/
- **Specification version**: 2023-04-30.
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
- **Implementation license (SPDX-Identifier)**: Public domain
- **Ancestors of primary source**:
- https://github.com/SWilson4/package-pqclean/tree/8db1b24b/hqc, which takes it from:
diff --git a/docs/algorithms/kem/hqc.yml b/docs/algorithms/kem/hqc.yml
index 1bcbe6566..8e78c4f9c 100644
--- a/docs/algorithms/kem/hqc.yml
+++ b/docs/algorithms/kem/hqc.yml
@@ -76,4 +76,4 @@ parameter-sets:
upstream: primary-upstream
primary-upstream:
spdx-license-identifier: Public domain
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md
index df0580968..3dd6dddc9 100644
--- a/docs/algorithms/sig/falcon.md
+++ b/docs/algorithms/sig/falcon.md
@@ -7,24 +7,30 @@
- **Authors' website**: https://falcon-sign.info
- **Specification version**: 20211101.
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
- **Implementation license (SPDX-Identifier)**: MIT
+- **Optimized Implementation sources**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
+ - **pqclean-aarch64**:
+ - **Source**: https://github.com/PQClean/PQClean/commit/7707d1bcc8ae7f9ffd296dd13b1d76d2767d14f8
+ - **Implementation license (SPDX-Identifier)**: Apache-2.0
## Parameter set summary
-| Parameter set | Parameter set alias | Security model | Claimed NIST Level | Public key size (bytes) | Secret key size (bytes) | Signature size (bytes) |
-|:---------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:|
-| Falcon-512 | NA | EUF-CMA | 1 | 897 | 1281 | 666 |
-| Falcon-1024 | NA | EUF-CMA | 5 | 1793 | 2305 | 1280 |
+| Parameter set | Parameter set alias | Security model | Claimed NIST Level | Public key size (bytes) | Secret key size (bytes) | Signature size (bytes) |
+|:------------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:|
+| Falcon-512 | NA | EUF-CMA | 1 | 897 | 1281 | 752 |
+| Falcon-1024 | NA | EUF-CMA | 5 | 1793 | 2305 | 1462 |
+| Falcon-padded-512 | NA | EUF-CMA | 1 | 897 | 1281 | 666 |
+| Falcon-padded-1024 | NA | EUF-CMA | 5 | 1793 | 2305 | 1280 |
## Falcon-512 implementation characteristics
-| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ |
-|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
-| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
-| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
-| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
Are implementations chosen based on runtime CPU feature detection? **Yes**.
@@ -32,11 +38,31 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
## Falcon-1024 implementation characteristics
-| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
-|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
-| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
-| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## Falcon-padded-512 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## Falcon-padded-1024 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
Are implementations chosen based on runtime CPU feature detection? **Yes**.
diff --git a/docs/algorithms/sig/falcon.yml b/docs/algorithms/sig/falcon.yml
index aa6a80304..781e188e0 100644
--- a/docs/algorithms/sig/falcon.yml
+++ b/docs/algorithms/sig/falcon.yml
@@ -18,17 +18,21 @@ website: https://falcon-sign.info
nist-round: 3
spec-version: 20211101
primary-upstream:
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
spdx-license-identifier: MIT
upstream-ancestors:
- https://www.falcon-sign.info
+optimized-upstreams:
+ pqclean-aarch64:
+ source: https://github.com/PQClean/PQClean/commit/7707d1bcc8ae7f9ffd296dd13b1d76d2767d14f8
+ spdx-license-identifier: Apache-2.0
parameter-sets:
- name: Falcon-512
claimed-nist-level: 1
claimed-security: EUF-CMA
length-public-key: 897
length-secret-key: 1281
- length-signature: 666
+ length-signature: 752
implementations-switch-on-runtime-cpu-features: true
implementations:
- upstream: primary-upstream
@@ -50,7 +54,7 @@ parameter-sets:
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
- - upstream: primary-upstream
+ - upstream: pqclean-aarch64
upstream-id: aarch64
supported-platforms:
- architecture: ARM64_V8
@@ -67,7 +71,46 @@ parameter-sets:
claimed-security: EUF-CMA
length-public-key: 1793
length-secret-key: 2305
- length-signature: 1280
+ length-signature: 1462
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: clean
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+ - upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+ - upstream: pqclean-aarch64
+ upstream-id: aarch64
+ supported-platforms:
+ - architecture: ARM64_V8
+ operating_systems:
+ - Linux
+ - Darwin
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+- name: Falcon-padded-512
+ claimed-nist-level: 1
+ claimed-security: EUF-CMA
+ length-public-key: 897
+ length-secret-key: 1281
+ length-signature: 666
implementations-switch-on-runtime-cpu-features: true
implementations:
- upstream: primary-upstream
@@ -89,7 +132,46 @@ parameter-sets:
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
+ - upstream: pqclean-aarch64
+ upstream-id: aarch64
+ supported-platforms:
+ - architecture: ARM64_V8
+ operating_systems:
+ - Linux
+ - Darwin
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+- name: Falcon-padded-1024
+ claimed-nist-level: 5
+ claimed-security: EUF-CMA
+ length-public-key: 1793
+ length-secret-key: 2305
+ length-signature: 1280
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: clean
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
- upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+ - upstream: pqclean-aarch64
upstream-id: aarch64
supported-platforms:
- architecture: ARM64_V8
diff --git a/docs/algorithms/sig/sphincs.md b/docs/algorithms/sig/sphincs.md
index a1660e483..096a87b29 100644
--- a/docs/algorithms/sig/sphincs.md
+++ b/docs/algorithms/sig/sphincs.md
@@ -7,7 +7,7 @@
- **Authors' website**: https://sphincs.org/
- **Specification version**: NIST Round 3 submission, v3.1 (June 10, 2022).
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852 with copy_from_upstream patches
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789 with copy_from_upstream patches
- **Implementation license (SPDX-Identifier)**: CC0-1.0
diff --git a/docs/algorithms/sig/sphincs.yml b/docs/algorithms/sig/sphincs.yml
index b5148335a..d3e6816c9 100644
--- a/docs/algorithms/sig/sphincs.yml
+++ b/docs/algorithms/sig/sphincs.yml
@@ -26,7 +26,7 @@ nist-round: 3
spec-version: NIST Round 3 submission, v3.1 (June 10, 2022)
spdx-license-identifier: CC0-1.0
primary-upstream:
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
with copy_from_upstream patches
spdx-license-identifier: CC0-1.0
upstream-ancestors:
diff --git a/docs/cbom.json b/docs/cbom.json
index 02d2d59ca..7dd47dc21 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -1,23 +1,23 @@
{
"bomFormat": "CBOM",
"specVersion": "1.4-cbom-1.0",
- "serialNumber": "urn:uuid:c25dad99-ad00-48b6-aa9e-25d4f7c3c8c5",
+ "serialNumber": "urn:uuid:b3ac0f3d-b320-4f0f-bbef-6c535c1e9874",
"version": 1,
"metadata": {
- "timestamp": "2023-12-13T17:05:36.137517",
+ "timestamp": "2024-03-05T11:49:42.428605",
"component": {
"type": "library",
- "bom-ref": "pkg:github/open-quantum-safe/liboqs@5f83324a6c464448b70b1e57b3cd161b6832e0e0",
+ "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
"name": "liboqs",
- "version": "5f83324a6c464448b70b1e57b3cd161b6832e0e0"
+ "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
}
},
"components": [
{
"type": "library",
- "bom-ref": "pkg:github/open-quantum-safe/liboqs@5f83324a6c464448b70b1e57b3cd161b6832e0e0",
+ "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
"name": "liboqs",
- "version": "5f83324a6c464448b70b1e57b3cd161b6832e0e0"
+ "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
},
{
"type": "crypto-asset",
@@ -1419,6 +1419,126 @@
"nistQuantumSecurityLevel": 5
}
},
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-512:generic",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-512",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-512:x86_64",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-512",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-512:armv8-a",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-512",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "armv8-a"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-1024:generic",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-1024",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-1024:x86_64",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-1024",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-1024:armv8-a",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-1024",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "armv8-a"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
{
"type": "crypto-asset",
"bom-ref": "alg:ML-DSA-44-ipd:generic",
@@ -2048,7 +2168,7 @@
],
"dependencies": [
{
- "ref": "pkg:github/open-quantum-safe/liboqs@5f83324a6c464448b70b1e57b3cd161b6832e0e0",
+ "ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
"dependsOn": [
"alg:BIKE-L1:x86_64",
"alg:BIKE-L3:x86_64",
@@ -2120,6 +2240,12 @@
"alg:Falcon-1024:generic",
"alg:Falcon-1024:x86_64",
"alg:Falcon-1024:armv8-a",
+ "alg:Falcon-padded-512:generic",
+ "alg:Falcon-padded-512:x86_64",
+ "alg:Falcon-padded-512:armv8-a",
+ "alg:Falcon-padded-1024:generic",
+ "alg:Falcon-padded-1024:x86_64",
+ "alg:Falcon-padded-1024:armv8-a",
"alg:ML-DSA-44-ipd:generic",
"alg:ML-DSA-44-ipd:x86_64",
"alg:ML-DSA-65-ipd:generic",
@@ -2675,6 +2801,48 @@
],
"dependencyType": "uses"
},
+ {
+ "ref": "alg:Falcon-padded-512:generic",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-512:x86_64",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-512:armv8-a",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-1024:generic",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-1024:x86_64",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-1024:armv8-a",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
{
"ref": "alg:ML-DSA-44-ipd:generic",
"dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index 32d897cdf..0db38f54b 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -548,6 +548,9 @@ def process_families(instructions, basedir, with_kat, with_generator):
print("Info: Updating KAT for %s" % (scheme['pretty_name_full']))
except KeyError: # new key
print("Adding new KAT for %s" % (scheme['pretty_name_full']))
+ # either a new scheme or a new KAT
+ if scheme['pretty_name_full'] not in kats['kem']:
+ kats['kem'][scheme['pretty_name_full']] = {}
pass
kats['kem'][scheme['pretty_name_full']]['single'] = scheme['metadata']['nistkat-sha256']
if 'alias_pretty_name_full' in scheme:
@@ -558,6 +561,9 @@ def process_families(instructions, basedir, with_kat, with_generator):
print("Info: Updating KAT for %s" % (scheme['pretty_name_full']))
except KeyError: # new key
print("Adding new KAT for %s" % (scheme['pretty_name_full']))
+ # either a new scheme or a new KAT
+ if scheme['pretty_name_full'] not in kats['sig']:
+ kats['sig'][scheme['pretty_name_full']] = {}
pass
kats['sig'][scheme['pretty_name_full']]['single'] = scheme['metadata']['nistkat-sha256']
if 'alias_pretty_name_full' in scheme:
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index f55b8798b..d8a9a4d12 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -14,7 +14,7 @@ upstreams:
name: pqclean
git_url: https://github.com/PQClean/PQClean.git
git_branch: master
- git_commit: 0657749a785db30e7f49e9435452cb042edb1852
+ git_commit: 8e221ae797b229858a0b0d784577a8cb149d5789
kem_meta_path: 'crypto_kem/{pqclean_scheme}/META.yml'
sig_meta_path: 'crypto_sign/{pqclean_scheme}/META.yml'
kem_scheme_path: 'crypto_kem/{pqclean_scheme}'
@@ -226,6 +226,16 @@ sigs:
pqclean_scheme: falcon-1024
pretty_name_full: Falcon-1024
signed_msg_order: falcon
+ -
+ scheme: "padded_512"
+ pqclean_scheme: falcon-padded-512
+ pretty_name_full: Falcon-padded-512
+ signed_msg_order: sig_then_msg
+ -
+ scheme: "padded_1024"
+ pqclean_scheme: falcon-padded-1024
+ pretty_name_full: Falcon-padded-1024
+ signed_msg_order: sig_then_msg
-
name: sphincs
default_implementation: clean
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index 4abe5c2ae..1b9b5a2d4 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -149,6 +149,12 @@
#cmakedefine OQS_ENABLE_SIG_falcon_1024 1
#cmakedefine OQS_ENABLE_SIG_falcon_1024_avx2 1
#cmakedefine OQS_ENABLE_SIG_falcon_1024_aarch64 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_512 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_512_avx2 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_512_aarch64 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_1024 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_1024_avx2 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_1024_aarch64 1
#cmakedefine OQS_ENABLE_SIG_SPHINCS 1
#cmakedefine OQS_ENABLE_SIG_sphincs_sha2_128f_simple 1
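Downstream C code can gate on the new variants at compile time via the macros generated above; a small illustrative sketch, assuming the installed header path `oqs/oqsconfig.h` (macro names exactly as defined in this file):

    #include <oqs/oqsconfig.h>
    #include <stdio.h>

    int main(void) {
    #ifdef OQS_ENABLE_SIG_falcon_padded_512
        puts("Falcon-padded-512 compiled in");
    #endif
    #ifdef OQS_ENABLE_SIG_falcon_padded_1024_avx2
        puts("Falcon-padded-1024 AVX2 implementation compiled in");
    #endif
        return 0;
    }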
diff --git a/src/sig/falcon/CMakeLists.txt b/src/sig/falcon/CMakeLists.txt
index ff5a41b43..4be3ae829 100644
--- a/src/sig/falcon/CMakeLists.txt
+++ b/src/sig/falcon/CMakeLists.txt
@@ -51,4 +51,50 @@ if(OQS_ENABLE_SIG_falcon_1024_aarch64)
set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_1024_aarch64>)
endif()
+if(OQS_ENABLE_SIG_falcon_padded_512)
+ add_library(falcon_padded_512_clean OBJECT sig_falcon_padded_512.c pqclean_falcon-padded-512_clean/codec.c pqclean_falcon-padded-512_clean/common.c pqclean_falcon-padded-512_clean/fft.c pqclean_falcon-padded-512_clean/fpr.c pqclean_falcon-padded-512_clean/keygen.c pqclean_falcon-padded-512_clean/pqclean.c pqclean_falcon-padded-512_clean/rng.c pqclean_falcon-padded-512_clean/sign.c pqclean_falcon-padded-512_clean/vrfy.c)
+ target_include_directories(falcon_padded_512_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-512_clean)
+ target_include_directories(falcon_padded_512_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_512_clean>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+ add_library(falcon_padded_512_avx2 OBJECT pqclean_falcon-padded-512_avx2/codec.c pqclean_falcon-padded-512_avx2/common.c pqclean_falcon-padded-512_avx2/fft.c pqclean_falcon-padded-512_avx2/fpr.c pqclean_falcon-padded-512_avx2/keygen.c pqclean_falcon-padded-512_avx2/pqclean.c pqclean_falcon-padded-512_avx2/rng.c pqclean_falcon-padded-512_avx2/sign.c pqclean_falcon-padded-512_avx2/vrfy.c)
+ target_include_directories(falcon_padded_512_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-512_avx2)
+ target_include_directories(falcon_padded_512_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_512_avx2 PRIVATE -mavx2)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_512_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+ add_library(falcon_padded_512_aarch64 OBJECT pqclean_falcon-padded-512_aarch64/codec.c pqclean_falcon-padded-512_aarch64/common.c pqclean_falcon-padded-512_aarch64/fft.c pqclean_falcon-padded-512_aarch64/fft_tree.c pqclean_falcon-padded-512_aarch64/fpr.c pqclean_falcon-padded-512_aarch64/keygen.c pqclean_falcon-padded-512_aarch64/ntt.c pqclean_falcon-padded-512_aarch64/ntt_consts.c pqclean_falcon-padded-512_aarch64/poly_float.c pqclean_falcon-padded-512_aarch64/poly_int.c pqclean_falcon-padded-512_aarch64/pqclean.c pqclean_falcon-padded-512_aarch64/rng.c pqclean_falcon-padded-512_aarch64/sampler.c pqclean_falcon-padded-512_aarch64/sign.c pqclean_falcon-padded-512_aarch64/util.c pqclean_falcon-padded-512_aarch64/vrfy.c)
+ target_include_directories(falcon_padded_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-512_aarch64)
+ target_include_directories(falcon_padded_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_512_aarch64 PRIVATE)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_512_aarch64>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_1024)
+ add_library(falcon_padded_1024_clean OBJECT sig_falcon_padded_1024.c pqclean_falcon-padded-1024_clean/codec.c pqclean_falcon-padded-1024_clean/common.c pqclean_falcon-padded-1024_clean/fft.c pqclean_falcon-padded-1024_clean/fpr.c pqclean_falcon-padded-1024_clean/keygen.c pqclean_falcon-padded-1024_clean/pqclean.c pqclean_falcon-padded-1024_clean/rng.c pqclean_falcon-padded-1024_clean/sign.c pqclean_falcon-padded-1024_clean/vrfy.c)
+ target_include_directories(falcon_padded_1024_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-1024_clean)
+ target_include_directories(falcon_padded_1024_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_1024_clean>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+ add_library(falcon_padded_1024_avx2 OBJECT pqclean_falcon-padded-1024_avx2/codec.c pqclean_falcon-padded-1024_avx2/common.c pqclean_falcon-padded-1024_avx2/fft.c pqclean_falcon-padded-1024_avx2/fpr.c pqclean_falcon-padded-1024_avx2/keygen.c pqclean_falcon-padded-1024_avx2/pqclean.c pqclean_falcon-padded-1024_avx2/rng.c pqclean_falcon-padded-1024_avx2/sign.c pqclean_falcon-padded-1024_avx2/vrfy.c)
+ target_include_directories(falcon_padded_1024_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-1024_avx2)
+ target_include_directories(falcon_padded_1024_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_1024_avx2 PRIVATE -mavx2)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_1024_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+ add_library(falcon_padded_1024_aarch64 OBJECT pqclean_falcon-padded-1024_aarch64/codec.c pqclean_falcon-padded-1024_aarch64/common.c pqclean_falcon-padded-1024_aarch64/fft.c pqclean_falcon-padded-1024_aarch64/fft_tree.c pqclean_falcon-padded-1024_aarch64/fpr.c pqclean_falcon-padded-1024_aarch64/keygen.c pqclean_falcon-padded-1024_aarch64/ntt.c pqclean_falcon-padded-1024_aarch64/ntt_consts.c pqclean_falcon-padded-1024_aarch64/poly_float.c pqclean_falcon-padded-1024_aarch64/poly_int.c pqclean_falcon-padded-1024_aarch64/pqclean.c pqclean_falcon-padded-1024_aarch64/rng.c pqclean_falcon-padded-1024_aarch64/sampler.c pqclean_falcon-padded-1024_aarch64/sign.c pqclean_falcon-padded-1024_aarch64/util.c pqclean_falcon-padded-1024_aarch64/vrfy.c)
+ target_include_directories(falcon_padded_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-1024_aarch64)
+ target_include_directories(falcon_padded_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_1024_aarch64 PRIVATE)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_1024_aarch64>)
+endif()
+
set(FALCON_OBJS ${_FALCON_OBJS} PARENT_SCOPE)
diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h b/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h
index cc2d49cf1..06787aaca 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_SECRETKEYBYTES 2305
#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_PUBLICKEYBYTES 1793
-#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES 1280
+#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES 1462
#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_ALGNAME "Falcon-1024"
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES 1280 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
index dfd6d8aea..e90daf2b7 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
@@ -281,10 +281,10 @@ int PQCLEAN_FALCON1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int1
uint16x8_t neon_q; // 1
neon_127 = vdupq_n_s16(127);
neon__127 = vdupq_n_s16(-127);
+ neon_q = vdupq_n_u16(FALCON_Q);
neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
- neon_q = vdupq_n_u16(FALCON_Q);
e.val[1] = vdupq_n_u16(0);
for (int i = 0; i < FALCON_N; i += 64) {
diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c
index 1eea81fa8..7355b07db 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON1024_AARCH64_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON1024_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
- v = PQCLEAN_FALCON1024_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON1024_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCON1024_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -230,6 +225,7 @@ do_verify(
int16_t hm[FALCON_N];
int16_t sig[FALCON_N];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -242,6 +238,7 @@ do_verify(
!= PQCLEAN_FALCON1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
return -1;
}
+ // We move the conversion of `h` to the NTT domain into verify_raw()
/*
* Decode signature.
@@ -249,9 +246,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON1024_AARCH64_comp_decode(sig, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON1024_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +287,9 @@ int
PQCLEAN_FALCON1024_AARCH64_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
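The verification change repeated in each of the pqclean.c files in this patch can be summarized by the following standalone sketch (a hypothetical helper, not part of the patch): a buffer whose length equals the padded value length (PQCLEAN_FALCONPADDED*_CRYPTO_BYTES - NONCELEN - 1) is accepted when the compressed value decodes to fewer bytes and the remainder is all-zero padding; any other length mismatch is rejected.

    #include <stddef.h>
    #include <stdint.h>

    /* decoded_len is the return value of comp_decode(); 0 means decoding failed. */
    int padded_sig_len_ok(const uint8_t *sigbuf, size_t sigbuflen,
                          size_t decoded_len, size_t padded_len) {
        if (decoded_len == 0) {
            return 0;                    /* decoding error */
        }
        if (decoded_len == sigbuflen) {
            return 1;                    /* ordinary compressed signature */
        }
        if (sigbuflen != padded_len) {
            return 0;                    /* not the fixed padded length */
        }
        for (size_t i = decoded_len; i < sigbuflen; i++) {
            if (sigbuf[i] != 0) {
                return 0;                /* padding bytes must all be zero */
            }
        }
        return 1;
    }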
diff --git a/src/sig/falcon/pqclean_falcon-1024_avx2/api.h b/src/sig/falcon/pqclean_falcon-1024_avx2/api.h
index a0f6db1f4..85e201fc2 100644
--- a/src/sig/falcon/pqclean_falcon-1024_avx2/api.h
+++ b/src/sig/falcon/pqclean_falcon-1024_avx2/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES 2305
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES 1793
-#define PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES 1280
+#define PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES 1462
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_ALGNAME "Falcon-1024"
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES 1280 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c
index 27708cd68..ea214a19f 100644
--- a/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON1024_AVX2_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
- v = PQCLEAN_FALCON1024_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCON1024_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[1024], hm[1024];
int16_t sig[1024];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON1024_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/api.h b/src/sig/falcon/pqclean_falcon-1024_clean/api.h
index 74fe34958..cc6557fde 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/api.h
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_SECRETKEYBYTES 2305
#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_PUBLICKEYBYTES 1793
-#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES 1280
+#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES 1462
#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_ALGNAME "Falcon-1024"
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES 1280 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c
index 7ced3ff0b..086d249ef 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON1024_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
- v = PQCLEAN_FALCON1024_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON1024_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCON1024_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[1024], hm[1024];
int16_t sig[1024];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON1024_CLEAN_comp_decode(sig, 10, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON1024_CLEAN_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON1024_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/api.h b/src/sig/falcon/pqclean_falcon-512_aarch64/api.h
index 996bf6185..d70db344b 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/api.h
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON512_AARCH64_CRYPTO_SECRETKEYBYTES 1281
#define PQCLEAN_FALCON512_AARCH64_CRYPTO_PUBLICKEYBYTES 897
-#define PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES 666
+#define PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES 752
#define PQCLEAN_FALCON512_AARCH64_CRYPTO_ALGNAME "Falcon-512"
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES 666 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h b/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h
index d1a49920b..c8f82991e 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h
@@ -123,4 +123,3 @@
#define vfmla_lane(d, c, a, b, i) d = vfmaq_laneq_f64(c, a, b, i);
// d = c - a * b[i]
#define vfmls_lane(d, c, a, b, i) d = vfmsq_laneq_f64(c, a, b, i);
-
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h b/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h
index dcacf718d..3702fa1bd 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h
@@ -2,6 +2,7 @@
#define POLY_H
#include "inner.h"
+#include "params.h"
typedef enum ntt_domain {
NTT_NONE = 0,
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c
index 8adf73821..b898d746a 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -44,7 +44,7 @@ int
PQCLEAN_FALCON512_AARCH64_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk) {
union {
- uint8_t b[FALCON_KEYGEN_TEMP_9];
+ uint8_t b[28 * FALCON_N];
uint64_t dummy_u64;
fpr dummy_fpr;
} tmp;
@@ -115,10 +115,7 @@ PQCLEAN_FALCON512_AARCH64_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON512_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
- v = PQCLEAN_FALCON512_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON512_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCON512_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -230,6 +225,7 @@ do_verify(
int16_t hm[FALCON_N];
int16_t sig[FALCON_N];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -250,9 +246,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON512_AARCH64_comp_decode(sig, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON512_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -278,20 +287,9 @@ int
PQCLEAN_FALCON512_AARCH64_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-512_avx2/api.h b/src/sig/falcon/pqclean_falcon-512_avx2/api.h
index acae41ae3..2f74f2627 100644
--- a/src/sig/falcon/pqclean_falcon-512_avx2/api.h
+++ b/src/sig/falcon/pqclean_falcon-512_avx2/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES 1281
#define PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES 897
-#define PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES 666
+#define PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES 752
#define PQCLEAN_FALCON512_AVX2_CRYPTO_ALGNAME "Falcon-512"
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES 666 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c
index 143246ebe..84e393d69 100644
--- a/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON512_AVX2_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
- v = PQCLEAN_FALCON512_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCON512_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[512], hm[512];
int16_t sig[512];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
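+ /*
+ * A value shorter than the buffer is accepted only when the buffer
+ * has the Falcon-padded length and all trailing bytes are zero
+ * padding; any other length mismatch is rejected.
+ */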
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON512_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/api.h b/src/sig/falcon/pqclean_falcon-512_clean/api.h
index 5c85f3834..49489d2b1 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/api.h
+++ b/src/sig/falcon/pqclean_falcon-512_clean/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON512_CLEAN_CRYPTO_SECRETKEYBYTES 1281
#define PQCLEAN_FALCON512_CLEAN_CRYPTO_PUBLICKEYBYTES 897
-#define PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES 666
+#define PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES 752
#define PQCLEAN_FALCON512_CLEAN_CRYPTO_ALGNAME "Falcon-512"
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES 666 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c
index 979146a7d..80d8cbe32 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON512_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
- v = PQCLEAN_FALCON512_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON512_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCON512_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[512], hm[512];
int16_t sig[512];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON512_CLEAN_comp_decode(sig, 9, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON512_CLEAN_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON512_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/LICENSE b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/LICENSE
new file mode 100644
index 000000000..4df2d7836
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/LICENSE
@@ -0,0 +1,57 @@
+This ARMv8 NEON implementation is provided under the Apache 2.0 license:
+
+/*
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+Based on the reference code provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/api.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/api.h
new file mode 100644
index 000000000..9b6299841
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED1024_AARCH64_API_H
+#define PQCLEAN_FALCONPADDED1024_AARCH64_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES 2305
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES 1793
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES 1280
+
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_ALGNAME "Falcon-padded-1024"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/codec.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/codec.c
new file mode 100644
index 000000000..05a8e49f3
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/codec.c
@@ -0,0 +1,554 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= FALCON_Q) {
+ return 0;
+ }
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_modq_decode(uint16_t *x, const void *in, size_t max_in_len, unsigned logn) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
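+ /*
+ * Sign-extend the bits-wide value: mask2 is its sign bit, and
+ * OR-ing with the two's-complement negation of (w & mask2)
+ * propagates that bit into the upper bits of w.
+ */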
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, uint8_t bits) {
+ size_t u, out_len;
+ int8_t minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ out_len = (size_t) ((FALCON_N * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ maxv = (int8_t) (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int8(x, minv, maxv)) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < FALCON_N; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(int8_t *x, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ in_len = ((FALCON_N * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < FALCON_N) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < FALCON_N) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x) {
+ uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int16(x, -2047, 2047)) {
+ return 0;
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * fits in the accumulator (a uint32_t).
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+
+ x[u] = (int16_t)(s ? -(int)m : (int)m);
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/common.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/common.c
new file mode 100644
index 000000000..883d89055
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/common.c
@@ -0,0 +1,549 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "macrous.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 5 * FALCON_Q) {
+ while (w >= FALCON_Q) {
+ w -= FALCON_Q;
+ }
+ *x++ = (uint16_t)w;
+ n--;
+ }
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce the values modulo q; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
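+ /*
+ * Branchless partial reduction modulo q = 12289: conditionally
+ * subtract 2*q (24578) twice, then q once; the final mask marks
+ * rejected samples (w >= 5*q = 61445) as 0xFFFF.
+ */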
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
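+ * Since p <= 287 < 512, ((j & p) + 0x1FF) >> 9 is 1 exactly
+ * when the 'p' bit of j is set.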
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h
+ * NEON provides the signed saturating doubling multiply-accumulate long
+ * instructions sqdmlal/sqdmlal2, so we maintain two parallel dependency
+ * chains rather than one for better scheduling.
+ * Each for loop is tuned for cache locality.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short(const int16_t *s1, const int16_t *s2) {
+ // Total SIMD register 18 = 16 + 2
+ int16x8x4_t neon_s1, neon_s2, neon_s3, neon_s4; // 16
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s1[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s1[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s1[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s1[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s2[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s2[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s2[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s2[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
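+ // Each accumulator holds twice the partial sum (sqdmlal doubles the
+ // products), so the halving add below recombines them into the
+ // plain squared norm.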
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const fpr *t0,
+ const fpr *t1) {
+ // Total SIMD registers: 26 = 16 + 8 + 2
+ int16x8x4_t neon_hm, neon_ts; // 8
+ float64x2x4_t neon_tf0, neon_tf1, neon_tf2, neon_tf3; // 16
+ int64x2x4_t neon_ts0, neon_ts1, neon_ts2, neon_ts3; // 16
+ int32x4x4_t neon_ts4, neon_ts5; // 8
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ // s1tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t0[i]);
+ vloadx4(neon_tf1, &t0[i + 8]);
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t0[i + 16]);
+ vloadx4(neon_tf3, &t0[i + 24]);
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ // hm = hm - fpr_rint(t0)
+ vload_s16_x4(neon_hm, &hm[i]);
+ neon_hm.val[0] = vsubq_s16(neon_hm.val[0], neon_ts.val[0]);
+ neon_hm.val[1] = vsubq_s16(neon_hm.val[1], neon_ts.val[1]);
+ neon_hm.val[2] = vsubq_s16(neon_hm.val[2], neon_ts.val[2]);
+ neon_hm.val[3] = vsubq_s16(neon_hm.val[3], neon_ts.val[3]);
+ vstore_s16_x4(&s1tmp[i], neon_hm);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[0]), vget_low_s16(neon_hm.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[1]), vget_low_s16(neon_hm.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[2]), vget_low_s16(neon_hm.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[3]), vget_low_s16(neon_hm.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[0], neon_hm.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[1], neon_hm.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[2], neon_hm.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[3], neon_hm.val[3]);
+ }
+
+ // s2tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t1[i]);
+ vloadx4(neon_tf1, &t1[i + 8]);
+
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t1[i + 16]);
+ vloadx4(neon_tf3, &t1[i + 24]);
+
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ neon_ts.val[0] = vnegq_s16(neon_ts.val[0]);
+ neon_ts.val[1] = vnegq_s16(neon_ts.val[1]);
+ neon_ts.val[2] = vnegq_s16(neon_ts.val[2]);
+ neon_ts.val[3] = vnegq_s16(neon_ts.val[3]);
+ vstore_s16_x4(&s2tmp[i], neon_ts);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[0]), vget_low_s16(neon_ts.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[1]), vget_low_s16(neon_ts.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[2]), vget_low_s16(neon_ts.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[3]), vget_low_s16(neon_ts.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[0], neon_ts.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[1], neon_ts.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[2], neon_ts.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[3], neon_ts.val[3]);
+ }
+
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
+
+int32_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_small_sqnorm(const int8_t *f) {
+ int8x16x4_t a;
+ int16x8x4_t b, c;
+ int32x4_t norm, norm_sh;
+
+ norm = vdupq_n_s32(0);
+ norm_sh = vdupq_n_s32(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&f[i]);
+
+ b.val[0] = vmovl_s8(vget_low_s8(a.val[0]));
+ b.val[1] = vmovl_high_s8(a.val[0]);
+ b.val[2] = vmovl_s8(vget_low_s8(a.val[1]));
+ b.val[3] = vmovl_high_s8(a.val[1]);
+
+ c.val[0] = vmovl_s8(vget_low_s8(a.val[2]));
+ c.val[1] = vmovl_high_s8(a.val[2]);
+ c.val[2] = vmovl_s8(vget_low_s8(a.val[3]));
+ c.val[3] = vmovl_high_s8(a.val[3]);
+
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[0]), vget_low_s16(b.val[0]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[1]), vget_low_s16(b.val[1]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[2]), vget_low_s16(b.val[2]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[3]), vget_low_s16(b.val[3]));
+
+ norm = vqdmlal_high_s16(norm, b.val[0], b.val[0]);
+ norm = vqdmlal_high_s16(norm, b.val[1], b.val[1]);
+ norm = vqdmlal_high_s16(norm, b.val[2], b.val[2]);
+ norm = vqdmlal_high_s16(norm, b.val[3], b.val[3]);
+
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[0]), vget_low_s16(c.val[0]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[1]), vget_low_s16(c.val[1]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[2]), vget_low_s16(c.val[2]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[3]), vget_low_s16(c.val[3]));
+
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[0], c.val[0]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[1], c.val[1]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[2], c.val[2]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[3], c.val[3]);
+ }
+ // 32x4
+ norm = vhaddq_s32(norm, norm_sh);
+ // 32x4 -> 32x2
+ int32x2_t tmp;
+ tmp = vqadd_s32(vget_low_s32(norm), vget_high_s32(norm));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ int32_t s;
+ s = vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft.c
new file mode 100644
index 000000000..652a306b0
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft.c
@@ -0,0 +1,1038 @@
+/*
+ * High-speed vectorized FFT code for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Forward FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, v_re, v_im, t_re, t_im, s;
+
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0];
+
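+ /*
+ * The level-2 twiddle factor has equal real and imaginary parts,
+ * so multiplying (y_re + i*y_im) by it needs only two real
+ * multiplications followed by the add/sub below.
+ */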
+ t_re = y_re * s;
+ t_im = y_im * s;
+
+ v_re = t_re - t_im;
+ v_im = t_re + t_im;
+
+ f[0] = x_re + v_re;
+ f[1] = x_re - v_re;
+ f[2] = x_im + v_im;
+ f[3] = x_im - v_im;
+}
+
+/*
+ * Vectorized 2 layers of Forward FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log3(fpr *f) {
+ // Total SIMD registers: 18 = 4 + 6 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t s_re_im, x, y; // 6
+ float64x2_t v_re, v_im, x_re, x_im, y_re, y_im, t_x, t_y; // 8
+
+ vloadx4(tmp, &f[0]);
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmul(v_re, tmp.val[1], s_re_im.val[0]);
+ vfmul(v_im, tmp.val[3], s_re_im.val[0]);
+
+ vfsub(t_x, v_re, v_im);
+ vfadd(t_y, v_re, v_im);
+
+ vfsub(tmp.val[1], tmp.val[0], t_x);
+ vfsub(tmp.val[3], tmp.val[2], t_y);
+
+ vfadd(tmp.val[0], tmp.val[0], t_x);
+ vfadd(tmp.val[2], tmp.val[2], t_y);
+
+ x_re = vtrn1q_f64(tmp.val[0], tmp.val[1]);
+ y_re = vtrn2q_f64(tmp.val[0], tmp.val[1]);
+ x_im = vtrn1q_f64(tmp.val[2], tmp.val[3]);
+ y_im = vtrn2q_f64(tmp.val[2], tmp.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(v_re, v_im, y_re, y_im, s_re_im.val[0], s_re_im.val[1]);
+
+ FPC_ADD(x.val[0], y.val[0], x_re, x_im, v_re, v_im);
+ FPC_SUB(x.val[1], y.val[1], x_re, x_im, v_re, v_im);
+
+ vstore2(&f[0], x);
+ vstore2(&f[4], y);
+}
+
+/*
+ * Vectorized 3 layers of Forward FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log4(fpr *f) {
+ // Total SIMD register: 26 = 8 + 18
+ float64x2x4_t t0, t1; // 8
+ float64x2x2_t x_re, x_im, y_re, y_im, v1, v2, tx, ty, s_re_im; // 18
+
+ vloadx4(t0, &f[0]);
+ vloadx4(t1, &f[8]);
+ vload(s_re_im.val[0], &fpr_tab_log2[0]);
+
+ vfmul(v1.val[0], t0.val[2], s_re_im.val[0]);
+ vfmul(v1.val[1], t0.val[3], s_re_im.val[0]);
+
+ vfmul(v2.val[0], t1.val[2], s_re_im.val[0]);
+ vfmul(v2.val[1], t1.val[3], s_re_im.val[0]);
+
+ vfsub(tx.val[0], v1.val[0], v2.val[0]);
+ vfsub(tx.val[1], v1.val[1], v2.val[1]);
+
+ vfadd(ty.val[0], v1.val[0], v2.val[0]);
+ vfadd(ty.val[1], v1.val[1], v2.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[2], t1.val[2], tx.val[0], ty.val[0]);
+ FWD_BOT(t0.val[1], t1.val[1], t0.val[3], t1.val[3], tx.val[1], ty.val[1]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ FWD_TOP_LANE(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ x_re.val[0] = t0.val[0];
+ x_re.val[1] = t0.val[2];
+ y_re.val[0] = t0.val[1];
+ y_re.val[1] = t0.val[3];
+
+ x_im.val[0] = t1.val[0];
+ x_im.val[1] = t1.val[2];
+ y_im.val[0] = t1.val[1];
+ y_im.val[1] = t1.val[3];
+
+ t0.val[0] = vzip1q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[1] = vzip2q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[2] = vzip1q_f64(y_re.val[0], y_re.val[1]);
+ t0.val[3] = vzip2q_f64(y_re.val[0], y_re.val[1]);
+
+ t1.val[0] = vzip1q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[1] = vzip2q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[2] = vzip1q_f64(y_im.val[0], y_im.val[1]);
+ t1.val[3] = vzip2q_f64(y_im.val[0], y_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ FWD_TOP(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ vstore4(&f[0], t0);
+ vstore4(&f[8], t1);
+}
+
+/*
+ * Vectorized 4 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(fpr *f, const unsigned logn) {
+ // Total SIMD register: 34 = 2 + 32
+ float64x2x2_t s_re_im; // 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im, v_re, v_im; // 32
+
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ unsigned int level = logn - 3;
+ const fpr *fpr_tab2 = fpr_table[level++],
+ *fpr_tab3 = fpr_table[level++],
+ *fpr_tab4 = fpr_table[level++],
+ *fpr_tab5 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+
+ /*
+ * We only increase k2 when j has the form j = 32*x + 16,
+ * i.e. when (j % 32) == 16.
+ */
+ k2 += 2 * ((j & 31) == 16);
+
+ vloadx4(y_re, &f[j + 8]);
+ vloadx4(y_im, &f[j + 8 + hn]);
+
+ if (logn == 5) {
+ // Handle the special case of fpr_tab_log2, where re == im.
+ // This reduces the number of multiplications, although it uses
+ // the same number of instructions as the "else" branch.
+ vfmulx4_i(t_im, y_im, s_re_im.val[0]);
+ vfmulx4_i(t_re, y_re, s_re_im.val[0]);
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+ } else {
+ FWD_TOP_LANEx4(v_re, v_im, y_re, y_im, s_re_im.val[0]);
+ }
+
+ vloadx4(x_re, &f[j]);
+ vloadx4(x_im, &f[j + hn]);
+
+ if ((j >> 4) & 1) {
+ FWD_BOTJx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ } else {
+ FWD_BOTx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[2], x_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[2], y_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[1]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ transpose_f64(x_re, x_re, v_re, 0, 2, 0);
+ transpose_f64(x_re, x_re, v_re, 1, 3, 1);
+ transpose_f64(x_im, x_im, v_im, 0, 2, 0);
+ transpose_f64(x_im, x_im, v_im, 1, 3, 1);
+
+ v_re.val[0] = x_re.val[2];
+ x_re.val[2] = x_re.val[1];
+ x_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = x_im.val[2];
+ x_im.val[2] = x_im.val[1];
+ x_im.val[1] = v_im.val[0];
+
+ transpose_f64(y_re, y_re, v_re, 0, 2, 2);
+ transpose_f64(y_re, y_re, v_re, 1, 3, 3);
+ transpose_f64(y_im, y_im, v_im, 0, 2, 2);
+ transpose_f64(y_im, y_im, v_im, 1, 3, 3);
+
+ v_re.val[0] = y_re.val[2];
+ y_re.val[2] = y_re.val[1];
+ y_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = y_im.val[2];
+ y_im.val[2] = y_im.val[1];
+ y_im.val[1] = v_im.val[0];
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j], x_re);
+ vstore4(&f[j + hn], x_im);
+
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vstore4(&f[j + 8], y_re);
+ vstore4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn1(fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ // Total SIMD register: 25 = 1 + 24
+ float64x2_t s_re_im; // 1
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im, v_re, v_im; // 24
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(b_re, &f[j + ht]);
+ vfmulx4_i(t_re, b_re, s_re_im);
+
+ vloadx4(b_im, &f[j + ht + hn]);
+ vfmulx4_i(t_im, b_im, s_re_im);
+
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+
+ FWD_BOTx4(a_re, a_im, b_re, b_im, v_re, v_im);
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j], a_re);
+
+ vstorex4(&f[j + ht + hn], b_im);
+ vstorex4(&f[j + hn], a_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn2(fpr *f, const unsigned logn, const unsigned level) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 8 + 16 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_tab1 = NULL, *fpr_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - level + 2;
+
+ for (l = level - 1; l > 4; l -= 2) {
+ len = 1 << (l - 2);
+ fpr_tab1 = fpr_table[bar++];
+ fpr_tab2 = fpr_table[bar++];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << l) {
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOT(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOT(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+
+ start += 1U << l;
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOTJ(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+ }
+ }
+}
+
+/*
+ * 1 layer of Inverse FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, s;
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f[0] = (x_re + y_re) * 0.5;
+ f[2] = (x_im + y_im) * 0.5;
+
+ x_re = (x_re - y_re) * s;
+ x_im = (x_im - y_im) * s;
+
+ f[1] = x_im + x_re;
+ f[3] = x_im - x_re;
+}
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log3(fpr *f) {
+ // Total SIMD registers: 12 = 4 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t x_re_im, y_re_im, v, s_re_im; // 8
+
+ vload2(x_re_im, &f[0]);
+ vload2(y_re_im, &f[4]);
+
+ vfsub(v.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfsub(v.val[1], y_re_im.val[0], y_re_im.val[1]);
+ vfadd(x_re_im.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfadd(x_re_im.val[1], y_re_im.val[0], y_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmul(y_re_im.val[0], v.val[1], s_re_im.val[1]);
+ vfmla(y_re_im.val[0], y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+ vfmls(y_re_im.val[1], y_re_im.val[1], v.val[0], s_re_im.val[1]);
+
+ tmp.val[0] = vtrn1q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[1] = vtrn2q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[2] = vtrn1q_f64(x_re_im.val[1], y_re_im.val[1]);
+ tmp.val[3] = vtrn2q_f64(x_re_im.val[1], y_re_im.val[1]);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfadd(x_re_im.val[0], tmp.val[0], tmp.val[1]);
+ vfadd(x_re_im.val[1], tmp.val[2], tmp.val[3]);
+ vfsub(v.val[0], tmp.val[0], tmp.val[1]);
+ vfsub(v.val[1], tmp.val[2], tmp.val[3]);
+
+ vfmuln(tmp.val[0], x_re_im.val[0], 0.25);
+ vfmuln(tmp.val[2], x_re_im.val[1], 0.25);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.25);
+
+ vfmul(y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+
+ vfadd(tmp.val[1], y_re_im.val[1], y_re_im.val[0]);
+ vfsub(tmp.val[3], y_re_im.val[1], y_re_im.val[0]);
+
+ vstorex4(&f[0], tmp);
+}
+
+/*
+ * Vectorized 3 layers of Inverse FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log4(fpr *f) {
+ // Total SIMD registers: 18 = 12 + 6
+ float64x2x4_t re, im, t; // 12
+ float64x2x2_t t_re, t_im, s_re_im; // 6
+
+ vload4(re, &f[0]);
+ vload4(im, &f[8]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ INV_BOTJ(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ // re: 0, 4 | 1, 5 | 2, 6 | 3, 7
+ // im: 8, 12| 9, 13|10, 14|11, 15
+ transpose_f64(re, re, t, 0, 1, 0);
+ transpose_f64(re, re, t, 2, 3, 1);
+ transpose_f64(im, im, t, 0, 1, 2);
+ transpose_f64(im, im, t, 2, 3, 3);
+
+ // re: 0, 1 | 4, 5 | 2, 3 | 6, 7
+ // im: 8, 9 | 12, 13|10, 11| 14, 15
+ t.val[0] = re.val[1];
+ re.val[1] = re.val[2];
+ re.val[2] = t.val[0];
+
+ t.val[1] = im.val[1];
+ im.val[1] = im.val[2];
+ im.val[2] = t.val[1];
+
+ // re: 0, 1 | 2, 3| 4, 5 | 6, 7
+ // im: 8, 9 | 10, 11| 12, 13| 14, 15
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ INV_BOTJ_LANE(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[2], im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], re.val[1], im.val[1], re.val[3], im.val[3]);
+
+ vfmuln(re.val[0], re.val[0], 0.12500000000);
+ vfmuln(re.val[1], re.val[1], 0.12500000000);
+ vfmuln(im.val[0], im.val[0], 0.12500000000);
+ vfmuln(im.val[1], im.val[1], 0.12500000000);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.12500000000);
+
+ vfmul(t_re.val[0], t_re.val[0], s_re_im.val[0]);
+ vfmul(t_re.val[1], t_re.val[1], s_re_im.val[0]);
+ vfmul(t_im.val[0], t_im.val[0], s_re_im.val[0]);
+ vfmul(t_im.val[1], t_im.val[1], s_re_im.val[0]);
+
+ vfsub(im.val[2], t_im.val[0], t_re.val[0]);
+ vfsub(im.val[3], t_im.val[1], t_re.val[1]);
+ vfadd(re.val[2], t_im.val[0], t_re.val[0]);
+ vfadd(re.val[3], t_im.val[1], t_re.val[1]);
+
+ vstorex4(&f[0], re);
+ vstorex4(&f[8], im);
+}
+
+/*
+ * Vectorized 4 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register: 26 = 24 + 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im; // 24
+ float64x2x2_t s_re_im; // 2
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+
+ unsigned int level = logn;
+ const fpr *fpr_tab5 = fpr_table[level--],
+ *fpr_tab4 = fpr_table[level--],
+ *fpr_tab3 = fpr_table[level--],
+ *fpr_tab2 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+
+ vload4(x_re, &f[j]);
+ vload4(x_im, &f[j + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[2], t_im.val[2], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ vload4(y_re, &f[j + 8]);
+ vload4(y_im, &f[j + 8 + hn]);
+
+ INV_TOPJ(t_re.val[1], t_im.val[1], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(x_re.val[3], x_im.val[3], t_re.val[2], t_im.val[2], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(y_re.val[1], y_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ transpose_f64(x_re, x_re, t_re, 0, 1, 0);
+ transpose_f64(x_re, x_re, t_re, 2, 3, 1);
+ transpose_f64(y_re, y_re, t_re, 0, 1, 2);
+ transpose_f64(y_re, y_re, t_re, 2, 3, 3);
+
+ transpose_f64(x_im, x_im, t_im, 0, 1, 0);
+ transpose_f64(x_im, x_im, t_im, 2, 3, 1);
+ transpose_f64(y_im, y_im, t_im, 0, 1, 2);
+ transpose_f64(y_im, y_im, t_im, 2, 3, 3);
+
+ t_re.val[0] = x_re.val[1];
+ x_re.val[1] = x_re.val[2];
+ x_re.val[2] = t_re.val[0];
+
+ t_re.val[1] = y_re.val[1];
+ y_re.val[1] = y_re.val[2];
+ y_re.val[2] = t_re.val[1];
+
+ t_im.val[0] = x_im.val[1];
+ x_im.val[1] = x_im.val[2];
+ x_im.val[2] = t_im.val[0];
+
+ t_im.val[1] = y_im.val[1];
+ y_im.val[1] = y_im.val[2];
+ y_im.val[2] = t_im.val[1];
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ INV_BOTJ_LANE(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJ_LANE(y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2], s_re_im.val[1]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[1]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ INV_BOTJ_LANE(x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJ_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJm_LANE(y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2], s_re_im.val[0]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0]);
+
+ if ((j >> 4) & 1) {
+ INV_TOPJmx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ } else {
+ INV_TOPJx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+ k2 += 2 * ((j & 31) == 16);
+
+ if (last) {
+ vfmuln(s_re_im.val[0], s_re_im.val[0], fpr_p2_tab[logn]);
+ vfmulnx4(x_re, x_re, fpr_p2_tab[logn]);
+ vfmulnx4(x_im, x_im, fpr_p2_tab[logn]);
+ }
+ vstorex4(&f[j], x_re);
+ vstorex4(&f[j + hn], x_im);
+
+ if (logn == 5) {
+ // Special case in fpr_tab_log2 where re == im
+ vfmulx4_i(t_re, t_re, s_re_im.val[0]);
+ vfmulx4_i(t_im, t_im, s_re_im.val[0]);
+
+ vfaddx4(y_re, t_im, t_re);
+ vfsubx4(y_im, t_im, t_re);
+ } else {
+ if ((j >> 4) & 1) {
+ INV_BOTJm_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ } else {
+ INV_BOTJ_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ }
+ }
+
+ vstorex4(&f[j + 8], y_re);
+ vstorex4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn1(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register 26 = 24 + 2
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im; // 24
+ float64x2_t s_re_im; // 2
+
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+ vloadx4(b_re, &f[j + ht]);
+ vloadx4(b_im, &f[j + ht + hn]);
+
+ INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im);
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ if (last) {
+ vfmuln(s_re_im, s_re_im, fpr_p2_tab[logn]);
+ vfmulnx4(a_re, a_re, fpr_p2_tab[logn]);
+ vfmulnx4(a_im, a_im, fpr_p2_tab[logn]);
+ }
+
+ vstorex4(&f[j], a_re);
+ vstorex4(&f[j + hn], a_im);
+
+ vfmulx4_i(t_re, t_re, s_re_im);
+ vfmulx4_i(t_im, t_im, s_re_im);
+
+ vfaddx4(b_re, t_im, t_re);
+ vfsubx4(b_im, t_im, t_re);
+
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j + ht + hn], b_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn2(fpr *f, const unsigned logn, const unsigned level, unsigned last) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 16 + 8 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_inv_tab1 = NULL, *fpr_inv_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - 4;
+
+ for (l = 4; l < logn - level - 1; l += 2) {
+ len = 1 << l;
+ last -= 1;
+ fpr_inv_tab1 = fpr_table[bar--];
+ fpr_inv_tab2 = fpr_table[bar--];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << (l + 2)) {
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJ(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJ_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+
+ start += 1U << (l + 2);
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJm(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJm_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJm_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+ }
+ }
+}
+
+/*
+ * Scalable vectorized Forward FFT implementation.
+ * Supports logn in [1, 10]; can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_FFT(fpr *f, const unsigned logn) {
+ unsigned level = logn;
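+ // Decomposition: the outer (long-stride) layers are handled by FFT_logn1
+ // (one layer) and/or FFT_logn2 (two layers per pass); FFT_log5 then
+ // finishes the remaining layers within blocks of 16 complex points.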
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, 5);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn2(f, logn, level);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn2(f, logn, level - 1);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, logn);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Scalable vectorized Inverse FFT implementation.
+ * Supports logn in [1, 10]; can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(fpr *f, const unsigned logn) {
+ const unsigned level = (logn - 5) & 1;
+
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, 5, 1);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn2(f, logn, level, 1);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn2(f, logn, level, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ default:
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft_tree.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft_tree.c
new file mode 100644
index 000000000..6e5432e25
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft_tree.c
@@ -0,0 +1,247 @@
+/*
+ * High-speed vectorized FFT tree for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Merge FFT for 2 complex points (4 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log2(fpr *f, const fpr *f0, const fpr *f1) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f0[0];
+ a_im = f0[1];
+ s = fpr_tab_log2[0];
+ b_re = f1[0] * s;
+ b_im = f1[1] * s;
+
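+ // d = f1 * (1 + i)/sqrt(2), i.e. f1 rotated by the twiddle factor e^(i*pi/4)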
+ d_re = b_re - b_im;
+ d_im = b_re + b_im;
+
+ f[0] = a_re + d_re;
+ f[2] = a_im + d_im;
+ f[1] = a_re - d_re;
+ f[3] = a_im - d_im;
+}
+
+/*
+ * Vectorized 1 layer of Merge FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log3(fpr *f, const fpr *f0, const fpr *f1) {
+ // Total SIMD registers: 12 = 10 + 2
+ float64x2x2_t g1, g0, g_re, g_im, s_re_im; // 10
+ float64x2_t t_re, t_im; // 2
+
+ vloadx2(g1, &f1[0]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(t_re, t_im, g1.val[0], g1.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vloadx2(g0, &f0[0]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0.val[0], g0.val[1], t_re, t_im);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0.val[0], g0.val[1], t_re, t_im);
+
+ vstore2(&f[0], g_re);
+ vstore2(&f[4], g_im);
+}
+
+/*
+ * Vectorized 1 layer of Merge FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log4(fpr *f, const fpr *f0, const fpr *f1, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_merge = fpr_table[logn];
+
+ // Total SIMD register 22 = 14 + 8
+ float64x2x2_t g1_re, g1_im, g0_re, g0_im, s_re_im, t_re, t_im; // 14
+ float64x2x4_t g_re, g_im; // 8
+
+ for (unsigned j = 0; j < ht; j += 4) {
+ vload2(g1_re, &f1[j]);
+ vload2(g1_im, &f1[j + ht]);
+
+ vload2(s_re_im, &fpr_merge[j]);
+
+ FWD_TOP(t_re.val[0], t_im.val[0], g1_re.val[0], g1_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_re, &f0[j]);
+
+ FWD_TOP(t_re.val[1], t_im.val[1], g1_re.val[1], g1_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_im, &f0[j + ht]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_ADDJ(g_re.val[2], g_im.val[2], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+ FPC_SUBJ(g_re.val[3], g_im.val[3], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j << 1], g_re);
+ vstore4(&f[(j + ht) << 1], g_im);
+ }
+}
+
+/*
+ * 1 layer of Split FFT for 2 complex points (4 coefficients).
+ */
+static void
+PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log2(fpr *restrict f0, fpr *restrict f1, const fpr *restrict f) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f[0];
+ b_re = f[1];
+ a_im = f[2];
+ b_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f0[0] = (a_re + b_re) * 0.5;
+ f0[1] = (a_im + b_im) * 0.5;
+
+ d_re = (a_re - b_re) * s;
+ d_im = (a_im - b_im) * s;
+
+ f1[0] = d_im + d_re;
+ f1[1] = d_im - d_re;
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log3(fpr *f0, fpr *f1, const fpr *f) {
+ // Total SIMD registers: 12
+ float64x2x2_t re, im, g0, g1, s_re_im, tm; // 12
+
+ vload2(re, &f[0]);
+ vload2(im, &f[4]);
+
+ FPC_ADD(g0.val[0], g0.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ FPC_SUB(tm.val[0], tm.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmuln(g0.val[0], g0.val[0], 0.5);
+ vfmuln(g0.val[1], g0.val[1], 0.5);
+ vstorex2(&f0[0], g0);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.5);
+ vfmuln(s_re_im.val[1], s_re_im.val[1], 0.5);
+
+ INV_BOTJ(g1.val[0], g1.val[1], tm.val[0], tm.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstorex2(&f1[0], g1);
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log4(fpr *f0, fpr *f1, const fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_split = fpr_table[logn];
+
+ // Total SIMD register 23 = 1 + 8 + 14
+ float64x2_t half; // 1
+ float64x2x4_t g_re, g_im; // 8
+ float64x2x2_t s_re_im, t_re, t_im, g1_re, g1_im, g0_re, g0_im; // 14
+
+ half = vdupq_n_f64(0.5);
+ for (unsigned j = 0; j < ht; j += 4) {
+ unsigned j2 = j << 1;
+ vload4(g_re, &f[j2]);
+ vload4(g_im, &f[j2 + hn]);
+
+ FPC_ADD(g0_re.val[0], g0_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_ADD(g0_re.val[1], g0_im.val[1], g_re.val[2], g_im.val[2], g_re.val[3], g_im.val[3]);
+
+ FPC_SUB(t_re.val[0], t_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_SUB(t_re.val[1], t_im.val[1], g_re.val[3], g_im.val[3], g_re.val[2], g_im.val[2]);
+
+ vload2(s_re_im, &fpr_split[j]);
+
+ vfmul(g0_re.val[0], g0_re.val[0], half);
+ vfmul(g0_re.val[1], g0_re.val[1], half);
+ vstore2(&f0[j], g0_re);
+
+ vfmul(g0_im.val[0], g0_im.val[0], half);
+ vfmul(g0_im.val[1], g0_im.val[1], half);
+ vstore2(&f0[j + ht], g0_im);
+
+ vfmul(s_re_im.val[0], s_re_im.val[0], half);
+ vfmul(s_re_im.val[1], s_re_im.val[1], half);
+
+ INV_BOTJ(g1_re.val[0], g1_im.val[0], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(g1_re.val[1], g1_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstore2(&f1[j], g1_re);
+ vstore2(&f1[j + ht], g1_im);
+ }
+}
+
+/*
+ * Vectorized Split FFT implementation
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1, const fpr *f, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; qn = 0;
+ f0[0] = f[0];
+ f1[0] = f[1];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log2(f0, f1, f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log3(f0, f1, f);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log4(f0, f1, f, logn);
+ break;
+ }
+}
+
+/*
+ * Vectorized Merge FFT implementation
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(fpr *restrict f, const fpr *restrict f0,
+ const fpr *restrict f1, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ f[0] = f0[0];
+ f[1] = f1[0];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log2(f, f0, f1);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log3(f, f0, f1);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log4(f, f0, f1, logn);
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.c
new file mode 100644
index 000000000..3270c0d38
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.c
@@ -0,0 +1,336 @@
+/*
+ * Compressed floating-point Twiddle Factor.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+
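+/*
+ * fpr_p2_tab[k] = 2^(1-k). The inverse FFT multiplies its output by
+ * fpr_p2_tab[logn] in its final pass.
+ */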
+const fpr fpr_p2_tab[] = {
+ 2.00000000000,
+ 1.00000000000,
+ 0.50000000000,
+ 0.25000000000,
+ 0.12500000000,
+ 0.06250000000,
+ 0.03125000000,
+ 0.01562500000,
+ 0.00781250000,
+ 0.00390625000,
+ 0.00195312500
+};
+
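+/*
+ * Each fpr_tab_logN table holds the twiddle factors consumed at FFT level N,
+ * stored as interleaved (real, imaginary) = (cos, sin) pairs. For example,
+ * fpr_tab_log2[0..1] is (cos(pi/4), sin(pi/4)).
+ */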
+const fpr fpr_tab_log2[] = {
+ 0.707106781186547524400844362, 0.707106781186547524400844362, // 4, 5
+};
+
+const fpr fpr_tab_log3[] = {
+ 0.923879532511286756128183189, 0.382683432365089771728459984, // 8, 9
+ -0.382683432365089771728459984, 0.923879532511286756128183189,
+};
+
+const fpr fpr_tab_log4[] = {
+ 0.980785280403230449126182236, 0.195090322016128267848284868, // 16
+ 0.555570233019602224742830814, 0.831469612302545237078788378, // 20
+};
+
+const fpr fpr_tab_log5[] = {
+ 0.995184726672196886244836953, 0.098017140329560601994195564, // 32
+ 0.634393284163645498215171613, 0.773010453362736960810906610, // 36
+ 0.881921264348355029712756864, 0.471396736825997648556387626, // 40
+ 0.290284677254462367636192376, 0.956940335732208864935797887, // 44
+};
+
+const fpr fpr_tab_log6[] = {
+ 0.998795456205172392714771605, 0.049067674327418014254954977, // 64
+ 0.671558954847018400625376850, 0.740951125354959091175616897, // 68
+ 0.903989293123443331586200297, 0.427555093430282094320966857, // 72
+ 0.336889853392220050689253213, 0.941544065183020778412509403, // 76
+ 0.970031253194543992603984207, 0.242980179903263889948274162, // 80
+ 0.514102744193221726593693839, 0.857728610000272069902269984, // 84
+ 0.803207531480644909806676513, 0.595699304492433343467036529, // 88
+ 0.146730474455361751658850130, 0.989176509964780973451673738, // 92
+};
+
+const fpr fpr_tab_log7[] = {
+ 0.999698818696204220115765650, 0.024541228522912288031734529, // 128
+ 0.689540544737066924616730630, 0.724247082951466920941069243, // 132
+ 0.914209755703530654635014829, 0.405241314004989870908481306, // 136
+ 0.359895036534988148775104572, 0.932992798834738887711660256, // 140
+ 0.975702130038528544460395766, 0.219101240156869797227737547, // 144
+ 0.534997619887097210663076905, 0.844853565249707073259571205, // 148
+ 0.817584813151583696504920884, 0.575808191417845300745972454, // 152
+ 0.170961888760301226363642357, 0.985277642388941244774018433, // 156
+ 0.992479534598709998156767252, 0.122410675199216198498704474, // 160
+ 0.615231590580626845484913563, 0.788346427626606262009164705, // 164
+ 0.870086991108711418652292404, 0.492898192229784036873026689, // 168
+ 0.266712757474898386325286515, 0.963776065795439866686464356, // 172
+ 0.949528180593036667195936074, 0.313681740398891476656478846, // 176
+ 0.449611329654606600046294579, 0.893224301195515320342416447, // 180
+ 0.757208846506484547575464054, 0.653172842953776764084203014, // 184
+ 0.073564563599667423529465622, 0.997290456678690216135597140, // 188
+};
+
+const fpr fpr_tab_log8[] = {
+ 0.999924701839144540921646491, 0.012271538285719926079408262, // 256
+ 0.698376249408972853554813503, 0.715730825283818654125532623, // 260
+ 0.919113851690057743908477789, 0.393992040061048108596188661, // 264
+ 0.371317193951837543411934967, 0.928506080473215565937167396, // 268
+ 0.978317370719627633106240097, 0.207111376192218549708116020, // 272
+ 0.545324988422046422313987347, 0.838224705554838043186996856, // 276
+ 0.824589302785025264474803737, 0.565731810783613197389765011, // 280
+ 0.183039887955140958516532578, 0.983105487431216327180301155, // 284
+ 0.993906970002356041546922813, 0.110222207293883058807899140, // 288
+ 0.624859488142386377084072816, 0.780737228572094478301588484, // 292
+ 0.876070094195406607095844268, 0.482183772079122748517344481, // 296
+ 0.278519689385053105207848526, 0.960430519415565811199035138, // 300
+ 0.953306040354193836916740383, 0.302005949319228067003463232, // 304
+ 0.460538710958240023633181487, 0.887639620402853947760181617, // 308
+ 0.765167265622458925888815999, 0.643831542889791465068086063, // 312
+ 0.085797312344439890461556332, 0.996312612182778012627226190, // 316
+ 0.998118112900149207125155861, 0.061320736302208577782614593, // 320
+ 0.662415777590171761113069817, 0.749136394523459325469203257, // 324
+ 0.898674465693953843041976744, 0.438616238538527637647025738, // 328
+ 0.325310292162262934135954708, 0.945607325380521325730945387, // 332
+ 0.966976471044852109087220226, 0.254865659604514571553980779, // 336
+ 0.503538383725717558691867071, 0.863972856121586737918147054, // 340
+ 0.795836904608883536262791915, 0.605511041404325513920626941, // 344
+ 0.134580708507126186316358409, 0.990902635427780025108237011, // 348
+ 0.987301418157858382399815802, 0.158858143333861441684385360, // 352
+ 0.585797857456438860328080838, 0.810457198252594791726703434, // 356
+ 0.851355193105265142261290312, 0.524589682678468906215098464, // 360
+ 0.231058108280671119643236018, 0.972939952205560145467720114, // 364
+ 0.937339011912574923201899593, 0.348418680249434568419308588, // 368
+ 0.416429560097637182562598911, 0.909167983090522376563884788, // 372
+ 0.732654271672412834615546649, 0.680600997795453050594430464, // 376
+ 0.036807222941358832324332691, 0.999322384588349500896221011, // 380
+};
+
+const fpr fpr_tab_log9[] = {
+ 0.999981175282601142656990438, 0.006135884649154475359640235, // 512
+ 0.702754744457225302452914421, 0.711432195745216441522130290, // 516
+ 0.921514039342041943465396332, 0.388345046698826291624993541, // 520
+ 0.377007410216418256726567823, 0.926210242138311341974793388, // 524
+ 0.979569765685440534439326110, 0.201104634842091911558443546, // 528
+ 0.550457972936604802977289893, 0.834862874986380056304401383, // 532
+ 0.828045045257755752067527592, 0.560661576197336023839710223, // 536
+ 0.189068664149806212754997837, 0.981963869109555264072848154, // 540
+ 0.994564570734255452119106243, 0.104121633872054579120943880, // 544
+ 0.629638238914927025372981341, 0.776888465673232450040827983, // 548
+ 0.879012226428633477831323711, 0.476799230063322133342158117, // 552
+ 0.284407537211271843618310615, 0.958703474895871555374645792, // 556
+ 0.955141168305770721498157712, 0.296150888243623824121786128, // 560
+ 0.465976495767966177902756065, 0.884797098430937780104007041, // 564
+ 0.769103337645579639346626069, 0.639124444863775743801488193, // 568
+ 0.091908956497132728624990979, 0.995767414467659793982495643, // 572
+ 0.998475580573294752208559038, 0.055195244349689939809447526, // 576
+ 0.666999922303637506650154222, 0.745057785441465962407907310, // 580
+ 0.901348847046022014570746093, 0.433093818853151968484222638, // 584
+ 0.331106305759876401737190737, 0.943593458161960361495301445, // 588
+ 0.968522094274417316221088329, 0.248927605745720168110682816, // 592
+ 0.508830142543107036931749324, 0.860866938637767279344583877, // 596
+ 0.799537269107905033500246232, 0.600616479383868926653875896, // 600
+ 0.140658239332849230714788846, 0.990058210262297105505906464, // 604
+ 0.988257567730749491404792538, 0.152797185258443427720336613, // 608
+ 0.590759701858874228423887908, 0.806847553543799272206514313, // 612
+ 0.854557988365400520767862276, 0.519355990165589587361829932, // 616
+ 0.237023605994367206867735915, 0.971503890986251775537099622, // 620
+ 0.939459223602189911962669246, 0.342660717311994397592781983, // 624
+ 0.422000270799799685941287941, 0.906595704514915365332960588, // 628
+ 0.736816568877369875090132520, 0.676092703575315960360419228, // 632
+ 0.042938256934940823077124540, 0.999077727752645382888781997, // 636
+ 0.999529417501093163079703322, 0.030674803176636625934021028, // 640
+ 0.685083667772700381362052545, 0.728464390448225196492035438, // 644
+ 0.911706032005429851404397325, 0.410843171057903942183466675, // 648
+ 0.354163525420490382357395796, 0.935183509938947577642207480, // 652
+ 0.974339382785575860518721668, 0.225083911359792835991642120, // 656
+ 0.529803624686294668216054671, 0.848120344803297251279133563, // 660
+ 0.814036329705948361654516690, 0.580813958095764545075595272, // 664
+ 0.164913120489969921418189113, 0.986308097244598647863297524, // 668
+ 0.991709753669099522860049931, 0.128498110793793172624415589, // 672
+ 0.610382806276309452716352152, 0.792106577300212351782342879, // 676
+ 0.867046245515692651480195629, 0.498227666972781852410983869, // 680
+ 0.260794117915275518280186509, 0.965394441697689374550843858, // 684
+ 0.947585591017741134653387321, 0.319502030816015677901518272, // 688
+ 0.444122144570429231642069418, 0.895966249756185155914560282, // 692
+ 0.753186799043612482483430486, 0.657806693297078656931182264, // 696
+ 0.067443919563664057897972422, 0.997723066644191609848546728, // 700
+ 0.996820299291165714972629398, 0.079682437971430121147120656, // 704
+ 0.648514401022112445084560551, 0.761202385484261814029709836, // 708
+ 0.890448723244757889952150560, 0.455083587126343823535869268, // 712
+ 0.307849640041534893682063646, 0.951435020969008369549175569, // 716
+ 0.962121404269041595429604316, 0.272621355449948984493347477, // 720
+ 0.487550160148435954641485027, 0.873094978418290098636085973, // 724
+ 0.784556597155575233023892575, 0.620057211763289178646268191, // 728
+ 0.116318630911904767252544319, 0.993211949234794533104601012, // 732
+ 0.984210092386929073193874387, 0.177004220412148756196839844, // 736
+ 0.570780745886967280232652864, 0.821102514991104679060430820, // 740
+ 0.841554977436898409603499520, 0.540171472729892881297845480, // 744
+ 0.213110319916091373967757518, 0.977028142657754351485866211, // 748
+ 0.930766961078983731944872340, 0.365612997804773870011745909, // 752
+ 0.399624199845646828544117031, 0.916679059921042663116457013, // 756
+ 0.720002507961381629076682999, 0.693971460889654009003734389, // 760
+ 0.018406729905804820927366313, 0.999830581795823422015722275, // 764
+};
+
+const fpr fpr_tab_log10[] = {
+ 0.999995293809576171511580126, 0.003067956762965976270145365, // 1024
+ 0.704934080375904908852523758, 0.709272826438865651316533772, // 1028
+ 0.922701128333878570437264227, 0.385516053843918864075607949, // 1032
+ 0.379847208924051170576281147, 0.925049240782677590302371869, // 1036
+ 0.980182135968117392690210009, 0.198098410717953586179324918, // 1040
+ 0.553016705580027531764226988, 0.833170164701913186439915922, // 1044
+ 0.829761233794523042469023765, 0.558118531220556115693702964, // 1048
+ 0.192080397049892441679288205, 0.981379193313754574318224190, // 1052
+ 0.994879330794805620591166107, 0.101069862754827824987887585, // 1056
+ 0.632018735939809021909403706, 0.774953106594873878359129282, // 1060
+ 0.880470889052160770806542929, 0.474100214650550014398580015, // 1064
+ 0.287347459544729526477331841, 0.957826413027532890321037029, // 1068
+ 0.956045251349996443270479823, 0.293219162694258650606608599, // 1072
+ 0.468688822035827933697617870, 0.883363338665731594736308015, // 1076
+ 0.771060524261813773200605759, 0.636761861236284230413943435, // 1080
+ 0.094963495329638998938034312, 0.995480755491926941769171600, // 1084
+ 0.998640218180265222418199049, 0.052131704680283321236358216, // 1088
+ 0.669282588346636065720696366, 0.743007952135121693517362293, // 1092
+ 0.902673318237258806751502391, 0.430326481340082633908199031, // 1096
+ 0.333999651442009404650865481, 0.942573197601446879280758735, // 1100
+ 0.969281235356548486048290738, 0.245955050335794611599924709, // 1104
+ 0.511468850437970399504391001, 0.859301818357008404783582139, // 1108
+ 0.801376171723140219430247777, 0.598160706996342311724958652, // 1112
+ 0.143695033150294454819773349, 0.989622017463200834623694454, // 1116
+ 0.988721691960323767604516485, 0.149764534677321517229695737, // 1120
+ 0.593232295039799808047809426, 0.805031331142963597922659282, // 1124
+ 0.856147328375194481019630732, 0.516731799017649881508753876, // 1128
+ 0.240003022448741486568922365, 0.970772140728950302138169611, // 1132
+ 0.940506070593268323787291309, 0.339776884406826857828825803, // 1136
+ 0.424779681209108833357226189, 0.905296759318118774354048329, // 1140
+ 0.738887324460615147933116508, 0.673829000378756060917568372, // 1144
+ 0.046003182130914628814301788, 0.998941293186856850633930266, // 1148
+ 0.999618822495178597116830637, 0.027608145778965741612354872, // 1152
+ 0.687315340891759108199186948, 0.726359155084345976817494315, // 1156
+ 0.912962190428398164628018233, 0.408044162864978680820747499, // 1160
+ 0.357030961233430032614954036, 0.934092550404258914729877883, // 1164
+ 0.975025345066994146844913468, 0.222093620973203534094094721, // 1168
+ 0.532403127877197971442805218, 0.846490938774052078300544488, // 1172
+ 0.815814410806733789010772660, 0.578313796411655563342245019, // 1176
+ 0.167938294974731178054745536, 0.985797509167567424700995000, // 1180
+ 0.992099313142191757112085445, 0.125454983411546238542336453, // 1184
+ 0.612810082429409703935211936, 0.790230221437310055030217152, // 1188
+ 0.868570705971340895340449876, 0.495565261825772531150266670, // 1192
+ 0.263754678974831383611349322, 0.964589793289812723836432159, // 1196
+ 0.948561349915730288158494826, 0.316593375556165867243047035, // 1200
+ 0.446868840162374195353044389, 0.894599485631382678433072126, // 1204
+ 0.755201376896536527598710756, 0.655492852999615385312679701, // 1208
+ 0.070504573389613863027351471, 0.997511456140303459699448390, // 1212
+ 0.997060070339482978987989949, 0.076623861392031492278332463, // 1216
+ 0.650846684996380915068975573, 0.759209188978388033485525443, // 1220
+ 0.891840709392342727796478697, 0.452349587233770874133026703, // 1224
+ 0.310767152749611495835997250, 0.950486073949481721759926101, // 1228
+ 0.962953266873683886347921481, 0.269668325572915106525464462, // 1232
+ 0.490226483288291154229598449, 0.871595086655951034842481435, // 1236
+ 0.786455213599085757522319464, 0.617647307937803932403979402, // 1240
+ 0.119365214810991364593637790, 0.992850414459865090793563344, // 1244
+ 0.984748501801904218556553176, 0.173983873387463827950700807, // 1248
+ 0.573297166698042212820171239, 0.819347520076796960824689637, // 1252
+ 0.843208239641845437161743865, 0.537587076295645482502214932, // 1256
+ 0.216106797076219509948385131, 0.976369731330021149312732194, // 1260
+ 0.931884265581668106718557199, 0.362755724367397216204854462, // 1264
+ 0.402434650859418441082533934, 0.915448716088267819566431292, // 1268
+ 0.722128193929215321243607198, 0.691759258364157774906734132, // 1272
+ 0.021474080275469507418374898, 0.999769405351215321657617036, // 1276
+ 0.999882347454212525633049627, 0.015339206284988101044151868, // 1280
+ 0.696177131491462944788582591, 0.717870045055731736211325329, // 1284
+ 0.917900775621390457642276297, 0.396809987416710328595290911, // 1288
+ 0.368466829953372331712746222, 0.929640895843181265457918066, // 1292
+ 0.977677357824509979943404762, 0.210111836880469621717489972, // 1296
+ 0.542750784864515906586768661, 0.839893794195999504583383987, // 1300
+ 0.822849781375826332046780034, 0.568258952670131549790548489, // 1304
+ 0.180022901405699522679906590, 0.983662419211730274396237776, // 1308
+ 0.993564135520595333782021697, 0.113270952177564349018228733, // 1312
+ 0.622461279374149972519166721, 0.782650596166575738458949301, // 1316
+ 0.874586652278176112634431897, 0.484869248000791101822951699, // 1320
+ 0.275571819310958163076425168, 0.961280485811320641748659653, // 1324
+ 0.952375012719765858529893608, 0.304929229735402406490728633, // 1328
+ 0.457813303598877221904961155, 0.889048355854664562540777729, // 1332
+ 0.763188417263381271704838297, 0.646176012983316364832802220, // 1336
+ 0.082740264549375693111987083, 0.996571145790554847093566910, // 1340
+ 0.997925286198596012623025462, 0.064382630929857460819324537, // 1344
+ 0.660114342067420478559490747, 0.751165131909686411205819422, // 1348
+ 0.897324580705418281231391836, 0.441371268731716692879988968, // 1352
+ 0.322407678801069848384807478, 0.946600913083283570044599823, // 1356
+ 0.966190003445412555433832961, 0.257831102162159005614471295, // 1360
+ 0.500885382611240786241285004, 0.865513624090569082825488358, // 1364
+ 0.793975477554337164895083757, 0.607949784967773667243642671, // 1368
+ 0.131540028702883111103387493, 0.991310859846115418957349799, // 1372
+ 0.986809401814185476970235952, 0.161886393780111837641387995, // 1376
+ 0.583308652937698294392830961, 0.812250586585203913049744181, // 1380
+ 0.849741768000852489471268395, 0.527199134781901348464274575, // 1384
+ 0.228072083170885739254457379, 0.973644249650811925318383912, // 1388
+ 0.936265667170278246576310996, 0.351292756085567125601307623, // 1392
+ 0.413638312238434547471944324, 0.910441292258067196934095369, // 1396
+ 0.730562769227827561177758850, 0.682845546385248068164596123, // 1400
+ 0.033741171851377584833716112, 0.999430604555461772019008327, // 1404
+ 0.999204758618363895492950001, 0.039872927587739811128578738, // 1408
+ 0.678350043129861486873655042, 0.734738878095963464563223604, // 1412
+ 0.907886116487666212038681480, 0.419216888363223956433010020, // 1416
+ 0.345541324963989065539191723, 0.938403534063108112192420774, // 1420
+ 0.972226497078936305708321144, 0.234041958583543423191242045, // 1424
+ 0.521975292937154342694258318, 0.852960604930363657746588082, // 1428
+ 0.808656181588174991946968128, 0.588281548222645304786439813, // 1432
+ 0.155828397654265235743101486, 0.987784141644572154230969032, // 1436
+ 0.990485084256457037998682243, 0.137620121586486044948441663, // 1440
+ 0.603066598540348201693430617, 0.797690840943391108362662755, // 1444
+ 0.862423956111040538690933878, 0.506186645345155291048942344, // 1448
+ 0.251897818154216950498106628, 0.967753837093475465243391912, // 1452
+ 0.944604837261480265659265493, 0.328209843579092526107916817, // 1456
+ 0.435857079922255491032544080, 0.900015892016160228714535267, // 1460
+ 0.747100605980180144323078847, 0.664710978203344868130324985, // 1464
+ 0.058258264500435759613979782, 0.998301544933892840738782163, // 1468
+ 0.996044700901251989887944810, 0.088853552582524596561586535, // 1472
+ 0.641481012808583151988739898, 0.767138911935820381181694573, // 1476
+ 0.886222530148880631647990821, 0.463259783551860197390719637, // 1480
+ 0.299079826308040476750336973, 0.954228095109105629780430732, // 1484
+ 0.959571513081984528335528181, 0.281464937925757984095231007, // 1488
+ 0.479493757660153026679839798, 0.877545290207261291668470750, // 1492
+ 0.778816512381475953374724325, 0.627251815495144113509622565, // 1496
+ 0.107172424956808849175529148, 0.994240449453187946358413442, // 1500
+ 0.982539302287441255907040396, 0.186055151663446648105438304, // 1504
+ 0.563199344013834115007363772, 0.826321062845663480311195452, // 1508
+ 0.836547727223511984524285790, 0.547894059173100165608820571, // 1512
+ 0.204108966092816874181696950, 0.978948175319062194715480124, // 1516
+ 0.927362525650401087274536959, 0.374164062971457997104393020, // 1520
+ 0.391170384302253888687512949, 0.920318276709110566440076541, // 1524
+ 0.713584868780793592903125099, 0.700568793943248366792866380, // 1528
+ 0.009203754782059819315102378, 0.999957644551963866333120920, // 1532
+};
+
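+/*
+ * fpr_table[logn] points to the twiddle table used when processing FFT level
+ * logn; levels 0 and 1 need no table and are left NULL.
+ */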
+const fpr *fpr_table[] = {
+ NULL, NULL,
+ fpr_tab_log2,
+ fpr_tab_log3,
+ fpr_tab_log4,
+ fpr_tab_log5,
+ fpr_tab_log6,
+ fpr_tab_log7,
+ fpr_tab_log8,
+ fpr_tab_log9,
+ fpr_tab_log10,
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.h
new file mode 100644
index 000000000..ae99a0bd6
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.h
@@ -0,0 +1,247 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "macrof.h"
+/*
+ * In this AArch64 implementation, 'fpr' is a plain alias for the native
+ * 'double' type; arithmetic on it should nevertheless go through the inline
+ * functions below, which add no runtime cost since they are all 'inline'.
+ */
+typedef double fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return (double)i;
+}
+
+static const fpr fpr_q = 12289.0 ;
+static const fpr fpr_inverse_of_q = 1.0 / 12289.0 ;
+static const fpr fpr_inv_2sqrsigma0 = .150865048875372721532312163019 ;
+static const fpr fpr_inv_sigma_10 = 0.0059386453095331159950250124336477482 ;
+static const fpr fpr_sigma_min_10 = 1.2982803343442918539708792538826807 ;
+static const fpr fpr_log2 = 0.69314718055994530941723212146 ;
+static const fpr fpr_inv_log2 = 1.4426950408889634073599246810 ;
+static const fpr fpr_bnorm_max = 16822.4121 ;
+static const fpr fpr_zero = 0.0 ;
+static const fpr fpr_one = 1.0 ;
+static const fpr fpr_two = 2.0 ;
+static const fpr fpr_onehalf = 0.5 ;
+static const fpr fpr_invsqrt2 = 0.707106781186547524400844362105 ;
+static const fpr fpr_invsqrt8 = 0.353553390593273762200422181052 ;
+static const fpr fpr_ptwo31 = 2147483648.0 ;
+static const fpr fpr_ptwo31m1 = 2147483647.0 ;
+static const fpr fpr_mtwo31m1 = -2147483647.0 ;
+static const fpr fpr_ptwo63m1 = 9223372036854775807.0 ;
+static const fpr fpr_mtwo63m1 = -9223372036854775807.0 ;
+static const fpr fpr_ptwo63 = 9223372036854775808.0 ;
+
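+/*
+ * Round to the nearest integer (ties to even) with the AArch64 fcvtns
+ * instruction, avoiding any data-dependent branch.
+ */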
+static inline int64_t
+fpr_rint(fpr x) {
+ int64_t t;
+ __asm__ ( "fcvtns %x0, %d1": "=r" (t) : "w" (x));
+ return t;
+}
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x;
+ return r - (x < (double)r);
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x;
+}
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return (x + y);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return (x - y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return (-x);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return (x * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return (x + x);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return (x * y);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return (x * x);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return (1.0 / x);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return (x / y);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ __asm__ ( "fsqrt %d0, %d0" : "+w" (x) : : );
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x < y;
+}
+
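+/*
+ * Compute ccs*exp(-x)*2^63, returned as an unsigned integer. exp(-x) is
+ * approximated by a degree-12 polynomial; the even/odd coefficient halves
+ * are evaluated in the two NEON lanes with fused multiply-adds and
+ * recombined with a horizontal add.
+ */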
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ static const double C_expm[] = {
+ 1.000000000000000000000000000000, // c0
+ -0.999999999999994892974086724280, // c1
+ 0.500000000000019206858326015208, // c2
+ -0.166666666666984014666397229121, // c3
+ 0.041666666666110491190622155955, // c4
+ -0.008333333327800835146903501993, // c5
+ 0.001388888894063186997887560103, // c6
+ -0.000198412739277311890541063977, // c7
+ 0.000024801566833585381209939524, // c8
+ -0.000002755586350219122514855659, // c9
+ 0.000000275607356160477811864927, // c10
+ -0.000000025299506379442070029551, // c11
+ 0.000000002073772366009083061987, // c12
+ 0.000000000000000000000000000000,
+ };
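+    /*
+     * Evaluation layout (informal sketch): assuming the vfmla() macro
+     * from macrof.h computes d = a + b*c lane-wise, the steps below
+     * build pairs of partial sums, e.g. y1 = {c0 + c2*x^2, c1 + c3*x^2};
+     * multiplying by neon_1x = {1, x} turns that into
+     * {c0 + c2*x^2, c1*x + c3*x^3}. Combining y1, y2*x^4, y3*x^8 and the
+     * c12*x^12 term, the final horizontal add yields
+     * sum(c_i * x^i, i = 0..12) scaled by ccs * 2^63, i.e. an
+     * Estrin-style evaluation of the polynomial approximating exp(-x).
+     */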
+ float64x2_t neon_x, neon_1x, neon_x2,
+ neon_x4, neon_x8, neon_x12, neon_ccs;
+ float64x2x4_t neon_exp0;
+ float64x2x3_t neon_exp1;
+ float64x2_t y1, y2, y3, y;
+ double ret;
+
+ neon_exp0 = vld1q_f64_x4(&C_expm[0]);
+ neon_exp1 = vld1q_f64_x3(&C_expm[8]);
+ neon_ccs = vdupq_n_f64(ccs);
+ neon_ccs = vmulq_n_f64(neon_ccs, fpr_ptwo63);
+
+ // x | x
+ neon_x = vdupq_n_f64(x);
+ // 1 | x
+ neon_1x = vsetq_lane_f64(1.0, neon_x, 0);
+ neon_x2 = vmulq_f64(neon_x, neon_x);
+ neon_x4 = vmulq_f64(neon_x2, neon_x2);
+ neon_x8 = vmulq_f64(neon_x4, neon_x4);
+ neon_x12 = vmulq_f64(neon_x8, neon_x4);
+
+ vfmla(y1, neon_exp0.val[0], neon_exp0.val[1], neon_x2);
+ vfmla(y2, neon_exp0.val[2], neon_exp0.val[3], neon_x2);
+ vfmla(y3, neon_exp1.val[0], neon_exp1.val[1], neon_x2);
+
+ y1 = vmulq_f64(y1, neon_1x);
+ y2 = vmulq_f64(y2, neon_1x);
+ y3 = vmulq_f64(y3, neon_1x);
+
+ vfmla(y, y1, y2, neon_x4);
+ vfmla(y, y, y3, neon_x8);
+ vfmla(y, y, neon_exp1.val[2], neon_x12);
+ y = vmulq_f64( y, neon_ccs);
+ ret = vaddvq_f64(y);
+
+ return (uint64_t) ret;
+}
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED1024_AARCH64_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+#define fpr_tab_log2 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log2
+#define fpr_tab_log3 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log3
+#define fpr_tab_log4 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log4
+#define fpr_tab_log5 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log5
+#define fpr_tab_log6 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log6
+#define fpr_tab_log7 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log7
+#define fpr_tab_log8 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log8
+#define fpr_tab_log9 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log9
+#define fpr_tab_log10 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log10
+#define fpr_table PQCLEAN_FALCONPADDED1024_AARCH64_fpr_table
+
+extern const fpr fpr_tab_log2[];
+extern const fpr fpr_tab_log3[];
+extern const fpr fpr_tab_log4[];
+extern const fpr fpr_tab_log5[];
+extern const fpr fpr_tab_log6[];
+extern const fpr fpr_tab_log7[];
+extern const fpr fpr_tab_log8[];
+extern const fpr fpr_tab_log9[];
+extern const fpr fpr_tab_log10[];
+extern const fpr *fpr_table[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/inner.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/inner.h
new file mode 100644
index 000000000..9674aecfc
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/inner.h
@@ -0,0 +1,825 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+#include "params.h"
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED1024_AARCH64_ macro (e.g. PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ *   with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(void *out, size_t max_out_len, const int8_t *x, uint8_t bits);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x);
+
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_modq_decode(uint16_t *x, const void *in,
+ size_t max_in_len, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(int8_t *x, unsigned bits, const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len);
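+
+/*
+ * Illustrative sketch (not part of this API): round-tripping a
+ * degree-1024 signature vector through the 'comp' codec. The buffer
+ * size used here is a hypothetical bound chosen only for the example.
+ *
+ *   int16_t s2[1024], dec[1024];
+ *   uint8_t buf[2048];
+ *   size_t len, rd;
+ *
+ *   len = PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(buf, sizeof buf, s2);
+ *   // len == 0 means the buffer was too small or a value was out of range
+ *   rd = PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(dec, buf, len);
+ *   // rd == 0 means the encoding was invalid
+ */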
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
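+
+/*
+ * Illustrative sketch (not part of this API): the context is typically
+ * prepared by injecting the nonce and then the message, flipping it,
+ * and deriving the hashed point. nonce, msg and their lengths are
+ * caller-provided; tmp alignment handling is omitted here.
+ *
+ *   inner_shake256_context sc;
+ *   uint16_t hm[1024];
+ *   uint8_t tmp[2048];
+ *
+ *   inner_shake256_init(&sc);
+ *   inner_shake256_inject(&sc, nonce, nonce_len);
+ *   inner_shake256_inject(&sc, msg, msg_len);
+ *   inner_shake256_flip(&sc);
+ *   PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(&sc, hm, 10, tmp);
+ *   inner_shake256_ctx_release(&sc);
+ */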
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short(const int16_t *s1, const int16_t *s2);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const double *t0,
+ const double *t1);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+/*
+ * Convert a public key to NTT. Conversion is done in place.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt(int16_t *h);
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt_monty(int16_t *h);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_compute_public(int16_t *h, const int8_t *f,
+ const int8_t *g, int16_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ *        return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ *   fpr fpr_sigma_min[]     sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+ }
+ return v;
+}
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(fpr *c, const fpr *a, const fpr *restrict b, const fpr *restrict d, unsigned logn);
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(fpr *d, fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d,
+ const fpr *a, const fpr *restrict b, unsigned logn);
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(fpr *c, const fpr *restrict d, const fpr *restrict a, unsigned logn);
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_fft(fpr *restrict c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add_muladj_fft(fpr *restrict d,
+ const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01, fpr *restrict g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(fpr *restrict d11, fpr *restrict l10,
+ const fpr *restrict g00, const fpr *restrict g01,
+ const fpr *restrict g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1,
+ const fpr *restrict f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(fpr *restrict f,
+ const fpr *restrict f0, const fpr *restrict f1, unsigned logn);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm, const unsigned falcon_n);
+
+fpr PQCLEAN_FALCONPADDED1024_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2);
+
+int32_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_small_sqnorm(const int8_t *f); // common.c
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
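+
+/*
+ * Illustrative sketch (not part of this API): generating a degree-1024
+ * key pair. The union provides the 64-bit alignment required for tmp[];
+ * the seed and its length are caller-provided.
+ *
+ *   union { uint8_t b[FALCON_KEYGEN_TEMP_10]; uint64_t a64; fpr afpr; } tmp;
+ *   int8_t f[1024], g[1024], F[1024], G[1024];
+ *   uint16_t h[1024];
+ *   inner_shake256_context rng;
+ *
+ *   inner_shake256_init(&rng);
+ *   inner_shake256_inject(&rng, seed, seed_len);
+ *   inner_shake256_flip(&rng);
+ *   PQCLEAN_FALCONPADDED1024_AARCH64_keygen(&rng, f, g, F, G, h, 10, tmp.b);
+ *   inner_shake256_ctx_release(&rng);
+ */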
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED1024_AARCH64_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signing cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp);
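+
+/*
+ * Illustrative sketch (not part of this API): dynamic signing of an
+ * already hashed message hm[] with a raw key (f, g, F, G), where rng is
+ * a seeded and flipped SHAKE256 context. The union provides the 64-bit
+ * alignment required for tmp[]; 72*1024 bytes corresponds to 72*2^logn
+ * with logn = 10.
+ *
+ *   union { uint8_t b[72 * 1024]; uint64_t a64; fpr afpr; } tmp;
+ *   int16_t sig[1024];
+ *
+ *   PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(sig, &rng, f, g, F, G, hm, tmp.b);
+ */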
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_gaussian0_sampler(prng *p);
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/keygen.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/keygen.c
new file mode 100644
index 000000000..d023e58c0
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/keygen.c
@@ -0,0 +1,4200 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "util.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
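+
+/*
+ * Worked example of the conversions above, writing montymul(a, b) for
+ * the Montgomery product (a*b)/R mod p: to obtain a plain product
+ * x*y mod p from operands in normal representation, first convert x to
+ * Montgomery representation by multiplying by R^2
+ * (montymul(x, R^2) = x*R), then use a mixed-representation product
+ * (montymul(x*R, y) = x*y mod p). Converting a value z*R back to normal
+ * representation is montymul(z*R, 1) = z.
+ */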
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
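+ *
+ * The computation below is a Newton iteration for inverses modulo a
+ * power of two: if p*y == 1 mod 2^k, then p*(y*(2 - p*y)) == 1 mod 2^(2k).
+ * The starting value y = 2 - p satisfies p*y == 1 mod 4 (since p is odd),
+ * and each of the four update steps doubles the precision (4, 8, 16, then
+ * 32 bits), so the final y is 1/p mod 2^32; returning (-y) mod 2^31 then
+ * yields -1/p mod 2^31.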
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
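+ *
+ * The returned value is a*b/(2^31) mod p (i.e. a*b*R^-1 mod p with
+ * R = 2^31): the correction term w is a multiple of p chosen so that
+ * z + w is divisible by 2^31, hence the shift by 31 bits is exact.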
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
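+ *
+ * The inverse of b is obtained through Fermat's little theorem: since p
+ * is prime, 1/b = b^(p-2) mod p, computed below with a square-and-multiply
+ * over the exponent e = p - 2 (using a branchless bit selection).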
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
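+ *
+ * This is the usual iterative (Cooley-Tukey style) NTT: at each of the
+ * logn stages, pairs (x, y) are combined into (x + s*y, x - s*y), where
+ * the twiddle factor s is read from gm[] (powers of g stored in
+ * bit-reversal order and in Montgomery representation).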
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
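+ *
+ * Why adjacent products work: in the NTT layout used here, entries 2*u
+ * and 2*u+1 hold the evaluations of f at a pair of opposite roots w and
+ * -w, and f(w)*f(-w) = f0(w^2)^2 - w^2*f1(w^2)^2 = f'(w^2); the products
+ * therefore form the (half-size) NTT of f'.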
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
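+ *
+ * For example, a nonnegative value v stored over 'len' words encodes
+ *    v = x[0] + x[1]*2^31 + ... + x[len-1]*2^(31*(len-1))
+ * with 0 <= x[i] < 2^31 for each word.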
+ */
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
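+ *
+ * Multiplying by 2^31 is done as a Montgomery multiplication by
+ * R2 = 2^62 mod p, since montymul(x, R2) = x*2^62/2^31 = x*2^31 mod p.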
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *restrict x,
+ const uint32_t *restrict y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *restrict tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
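+ *
+ * This is one step of incremental CRT: writing the new value as
+ * (x mod q) + q*t, the condition "new value == xp mod p" requires
+ * t = (xp - xq)/q mod p; the factor s therefore plays the role of
+ * 1/q mod p.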
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have a 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
+ const uint32_t *restrict x, const uint32_t *restrict y,
+ size_t len, uint32_t *restrict tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remainder of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step reduces the total length
+ * (sum of lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, so dividing one of them by 2 amounts
+ * to simply not doubling it.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to an upper word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
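+ *
+ * Reduction modulo X^N+1 is handled by the wrap in the inner loop: when
+ * the product index u+v reaches N, accumulation restarts at index 0 with
+ * the sign of k flipped, since X^N = -1.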
+ */
+static void
+poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
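+ *
+ * The product k*f is computed in RNS: for each small prime, k and f are
+ * reduced, multiplied in NTT representation, and converted back; the CRT
+ * then rebuilds k*f as plain big integers, and a single pass subtracts
+ * the scaled result from F.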
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *restrict tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
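+ *
+ * For example, with logn = 9 (N = 512), two values are summed, and the
+ * result has standard deviation sigma*sqrt(2) = 1.17*sqrt(q/(2*512)),
+ * which is the target for N = 512.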
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
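+ *
+ * The accumulator ng below ORs together all intermediate sums; if any of
+ * them reaches 2^31, its top bit makes -(ng >> 31) equal to all-ones,
+ * which forces the returned value to 2^32-1.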
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regards to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regards to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
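+ *
+ * For each small prime, the output coefficients are the products of
+ * adjacent NTT coefficients of the input, i.e. the same field-norm
+ * construction as in modp_poly_rec_res(): f' = f0^2 - X*f1^2 with
+ * f = f0(X^2) + X*f1(X^2), and similarly for g'.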
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
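+ /*
+ * Note: each step halves the degree. The first step takes its
+ * input straight from modp_set() (not in NTT), hence in_ntt is
+ * (d != 0); every intermediate output is kept in NTT so that the
+ * next step can skip the forward transform, and only the final
+ * step honours the caller's out_ntt flag.
+ */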
+ for (d = 0; d < depth; d ++) {
+ make_fg_step(data, logn - d, d,
+ d != 0, (d + 1) < depth || out_ntt);
+ }
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure of the NTRU solving equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficients will remain low. If they go out of range, then
+ * something went wrong and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ rlen = (slen > 10) ? 10 : slen;
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt3, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt4, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ rlen = (FGlen > 10) ? 10 : FGlen;
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(rt2, rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in the -(2^31-1)..+(2^31-1) range.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
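+ /*
+ * Worked example (illustration only): for dc = 5 the loop starts
+ * with pt = 1/2 and pdc = 1, then walks the bits of dc from least
+ * to most significant:
+ * bit 0 (=1): pdc = 1/2, pt = 1/4
+ * bit 1 (=0): pdc = 1/2, pt = 1/16
+ * bit 2 (=1): pdc = 1/32, pt = 1/256
+ * so pdc ends up equal to 2^(-5), the desired factor 2^(-dc).
+ */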
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
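+ /*
+ * Explanatory note: each word of Ft/Gt is a 31-bit limb, so bit 30
+ * of the top limb is the sign bit. The expression
+ * sw = -(Ft[FGlen - 1] >> 30) >> 1
+ * yields 0x00000000 for a nonnegative value and 0x7FFFFFFF for a
+ * negative one, which is exactly the padding limb needed to
+ * sign-extend over 31-bit words.
+ */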
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the n first elements
+ * of the inverse NTT table (igm); the call to modp_mkgm()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depth except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt5, rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt3, rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt4, rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_autoadj_fft(rt3, rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
+ r = modp_montymul(12289, 1, p, p0i);
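+ /*
+ * Note: modp_montymul(x, y, p, p0i) returns x*y/R mod p, so both z
+ * (below) and r carry the same 1/R factor; comparing them therefore
+ * checks f*G - g*F = q mod p without any extra conversion.
+ */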
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
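+ /*
+ * (Sketch of why: modulo 2, X^N+1 = (X+1)^N, so the resultant
+ * Res(f, X^N+1) has the same parity as f(1)^N, i.e. as the sum of
+ * the coefficients of f; an odd sum thus gives an odd resultant.)
+ */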
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ int16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
+ norm = (normf + normg) | -((normf | normg) >> 31);
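+ /*
+ * If either squared norm has bit 31 set, -((normf | normg) >> 31)
+ * is 0xFFFFFFFF and 'norm' saturates to 0xFFFFFFFF, so the test
+ * below rejects the pair; otherwise norm is simply normf + normg.
+ */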
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+
+ poly_small_to_fp(rt1, f, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt1, rt1, logn);
+
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt2, rt2, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(rt1, rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(rt2, rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt2, logn);
+
+ bnorm = PQCLEAN_FALCONPADDED1024_AARCH64_compute_bnorm(rt1, rt2);
+
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (int16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = (int16_t *)h;
+ tmp2 = (int16_t *)tmp;
+ }
+
+ if (!PQCLEAN_FALCONPADDED1024_AARCH64_compute_public(h2, f, g, tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrof.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrof.h
new file mode 100644
index 000000000..c8f82991e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrof.h
@@ -0,0 +1,125 @@
+/*
+ * 64-bit Floating point NEON macro x1
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+// c <= addr x1
+#define vload(c, addr) c = vld1q_f64(addr);
+// c <= addr interleave 2
+#define vload2(c, addr) c = vld2q_f64(addr);
+// c <= addr interleave 4
+#define vload4(c, addr) c = vld4q_f64(addr);
+
+#define vstore(addr, c) vst1q_f64(addr, c);
+// addr <= c
+#define vstore2(addr, c) vst2q_f64(addr, c);
+// addr <= c
+#define vstore4(addr, c) vst4q_f64(addr, c);
+
+// c <= addr x2
+#define vloadx2(c, addr) c = vld1q_f64_x2(addr);
+// c <= addr x3
+#define vloadx3(c, addr) c = vld1q_f64_x3(addr);
+
+// addr <= c
+#define vstorex2(addr, c) vst1q_f64_x2(addr, c);
+
+// c = a - b
+#define vfsub(c, a, b) c = vsubq_f64(a, b);
+
+// c = a + b
+#define vfadd(c, a, b) c = vaddq_f64(a, b);
+
+// c = a * b
+#define vfmul(c, a, b) c = vmulq_f64(a, b);
+
+// c = a * n (n is constant)
+#define vfmuln(c, a, n) c = vmulq_n_f64(a, n);
+
+// Swap from a|b to b|a
+#define vswap(c, a) c = vextq_f64(a, a, 1);
+
+// c = a * b[i]
+#define vfmul_lane(c, a, b, i) c = vmulq_laneq_f64(a, b, i);
+
+// c = 1/a
+#define vfinv(c, a) c = vdivq_f64(vdupq_n_f64(1.0), a);
+
+// c = -a
+#define vfneg(c, a) c = vnegq_f64(a);
+
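+// 2x2 transpose of a.val[ia] and b.val[ib], with t.val[it] as scratch:
+// afterwards a.val[ia] = (a0, b0) and b.val[ib] = (a1, b1).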
+#define transpose_f64(a, b, t, ia, ib, it) \
+ t.val[it] = a.val[ia]; \
+ a.val[ia] = vzip1q_f64(a.val[ia], b.val[ib]); \
+ b.val[ib] = vzip2q_f64(t.val[it], b.val[ib]);
+
+/*
+ * c = a + jb
+ * c[0] = a[0] - b[1]
+ * c[1] = a[1] + b[0]
+ */
+#define vfcaddj(c, a, b) c = vcaddq_rot90_f64(a, b);
+
+/*
+ * c = a - jb
+ * c[0] = a[0] + b[1]
+ * c[1] = a[1] - b[0]
+ */
+#define vfcsubj(c, a, b) c = vcaddq_rot270_f64(a, b);
+
+// c[0] = c[0] + b[0]*a[0], c[1] = c[1] + b[1]*a[0]
+#define vfcmla(c, a, b) c = vcmlaq_f64(c, a, b);
+
+// c[0] = c[0] - b[1]*a[1], c[1] = c[1] + b[0]*a[1]
+#define vfcmla_90(c, a, b) c = vcmlaq_rot90_f64(c, a, b);
+
+// c[0] = c[0] - b[0]*a[0], c[1] = c[1] - b[1]*a[0]
+#define vfcmla_180(c, a, b) c = vcmlaq_rot180_f64(c, a, b);
+
+// c[0] = c[0] + b[1]*a[1], c[1] = c[1] - b[0]*a[1]
+#define vfcmla_270(c, a, b) c = vcmlaq_rot270_f64(c, a, b);
+
+/*
+ * Complex MUL: c = a*b
+ * c[0] = a[0]*b[0] - a[1]*b[1]
+ * c[1] = a[0]*b[1] + a[1]*b[0]
+ */
+#define FPC_CMUL(c, a, b) \
+ c = vmulq_laneq_f64(b, a, 0); \
+ c = vcmlaq_rot90_f64(c, a, b);
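+// (Expansion check: vmulq_laneq_f64(b, a, 0) gives (a[0]*b[0], a[0]*b[1]),
+// and the rot90 accumulate adds (-a[1]*b[1], +a[1]*b[0]), which matches the
+// complex product described above.)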
+
+/*
+ * Complex MUL: c = a * conjugate(b) = a * (b[0], -b[1])
+ * c[0] = b[0]*a[0] + b[1]*a[1]
+ * c[1] = + b[0]*a[1] - b[1]*a[0]
+ */
+#define FPC_CMUL_CONJ(c, a, b) \
+ c = vmulq_laneq_f64(a, b, 0); \
+ c = vcmlaq_rot270_f64(c, b, a);
+
+// d = c + a * b
+#define vfmla(d, c, a, b) d = vfmaq_f64(c, a, b);
+// d = c - a * b
+#define vfmls(d, c, a, b) d = vfmsq_f64(c, a, b);
+// d = c + a * b[i]
+#define vfmla_lane(d, c, a, b, i) d = vfmaq_laneq_f64(c, a, b, i);
+// d = c - a * b[i]
+#define vfmls_lane(d, c, a, b, i) d = vfmsq_laneq_f64(c, a, b, i);
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrofx4.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrofx4.h
new file mode 100644
index 000000000..e6b70e64e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrofx4.h
@@ -0,0 +1,430 @@
+/*
+ * 64-bit Floating point NEON macro x4
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+#include "macrof.h"
+
+#define vloadx4(c, addr) c = vld1q_f64_x4(addr);
+
+#define vstorex4(addr, c) vst1q_f64_x4(addr, c);
+
+#define vfdupx4(c, constant) \
+ c.val[0] = vdupq_n_f64(constant); \
+ c.val[1] = vdupq_n_f64(constant); \
+ c.val[2] = vdupq_n_f64(constant); \
+ c.val[3] = vdupq_n_f64(constant);
+
+#define vfnegx4(c, a) \
+ c.val[0] = vnegq_f64(a.val[0]); \
+ c.val[1] = vnegq_f64(a.val[1]); \
+ c.val[2] = vnegq_f64(a.val[2]); \
+ c.val[3] = vnegq_f64(a.val[3]);
+
+#define vfmulnx4(c, a, n) \
+ c.val[0] = vmulq_n_f64(a.val[0], n); \
+ c.val[1] = vmulq_n_f64(a.val[1], n); \
+ c.val[2] = vmulq_n_f64(a.val[2], n); \
+ c.val[3] = vmulq_n_f64(a.val[3], n);
+
+// c = a - b
+#define vfsubx4(c, a, b) \
+ c.val[0] = vsubq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_f64(a.val[3], b.val[3]);
+
+// c = a + b
+#define vfaddx4(c, a, b) \
+ c.val[0] = vaddq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vmulq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vmulq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vmulq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4_i(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b); \
+ c.val[1] = vmulq_f64(a.val[1], b); \
+ c.val[2] = vmulq_f64(a.val[2], b); \
+ c.val[3] = vmulq_f64(a.val[3], b);
+
+#define vfinvx4(c, a) \
+ c.val[0] = vdivq_f64(vdupq_n_f64(1.0), a.val[0]); \
+ c.val[1] = vdivq_f64(vdupq_n_f64(1.0), a.val[1]); \
+ c.val[2] = vdivq_f64(vdupq_n_f64(1.0), a.val[2]); \
+ c.val[3] = vdivq_f64(vdupq_n_f64(1.0), a.val[3]);
+
+#define vfcvtx4(c, a) \
+ c.val[0] = vcvtq_f64_s64(a.val[0]); \
+ c.val[1] = vcvtq_f64_s64(a.val[1]); \
+ c.val[2] = vcvtq_f64_s64(a.val[2]); \
+ c.val[3] = vcvtq_f64_s64(a.val[3]);
+
+#define vfmlax4(d, c, a, b) \
+ vfmla(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmla(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmla(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmla(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfmlsx4(d, c, a, b) \
+ vfmls(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmls(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmls(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmls(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfrintx4(c, a) \
+ c.val[0] = vcvtnq_s64_f64(a.val[0]); \
+ c.val[1] = vcvtnq_s64_f64(a.val[1]); \
+ c.val[2] = vcvtnq_s64_f64(a.val[2]); \
+ c.val[3] = vcvtnq_s64_f64(a.val[3]);
+
+/*
+ * Wrapper for FFT, split/merge and poly_float.c
+ */
+
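+// Complex multiply on packed re/im lanes: d = a * b, i.e.
+// d_re = a_re*b_re - a_im*b_im and d_im = a_re*b_im + a_im*b_re.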
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmul(d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MULx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MULx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmul(d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmul(d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmul(d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
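+// Complex multiply-accumulate: (d_re, d_im) += (a_re, a_im) * (b_re, b_im).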
+#define FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
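+// Conjugate multiply over four lanes: d = a * conj(b), i.e.
+// d_re = a_re*b_re + a_im*b_im and d_im = a_im*b_re - a_re*b_im.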
+#define FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmul(d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmul(d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmul(d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmul(d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmul(d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmul(d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmul(d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MUL_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_re, b_re_im, 1); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 0);
+
+#define FPC_MUL_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 1); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 0);
+
+#define FWD_TOP(t_re, t_im, b_re, b_im, zeta_re, zeta_im) \
+ FPC_MUL(t_re, t_im, b_re, b_im, zeta_re, zeta_im);
+
+#define FWD_TOP_LANE(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANE(t_re, t_im, b_re, b_im, zeta);
+
+#define FWD_TOP_LANEx4(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANEx4(t_re, t_im, b_re, b_im, zeta);
+
+/*
+ * FPC
+ */
+
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_re); \
+ d_im = vsubq_f64(a_im, b_im);
+
+#define FPC_SUBx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_im.val[3]);
+
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_re); \
+ d_im = vaddq_f64(a_im, b_im);
+
+#define FPC_ADDx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_im.val[3]);
+
+#define FWD_BOT(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUB(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+/*
+ * FPC_J
+ */
+
+#define FPC_ADDJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_im); \
+ d_im = vaddq_f64(a_im, b_re);
+
+#define FPC_ADDJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_re.val[3]);
+
+#define FPC_SUBJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_im); \
+ d_im = vsubq_f64(a_im, b_re);
+
+#define FPC_SUBJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_re.val[3]);
+
+#define FWD_BOTJ(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJ(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJ(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTJx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+//============== Inverse FFT
+/*
+ * FPC_J
+ * a * conj(b)
+ * Original (without swap):
+ * d_re = b_im * a_im + a_re * b_re;
+ * d_im = b_re * a_im - a_re * b_im;
+ */
+#define FPC_MUL_BOTJ_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmla_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_im, b_re_im, 0); \
+ vfmls_lane(d_im, d_im, a_re, b_re_im, 1);
+
+#define FPC_MUL_BOTJ_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmls_lane(d_im.val[0], d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmls_lane(d_im.val[1], d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmls_lane(d_im.val[2], d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[3], a_im.val[3], b_re_im, 0); \
+ vfmls_lane(d_im.val[3], d_im.val[3], a_re.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, b_im, a_im); \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmul(d_im, b_re, a_im); \
+ vfmls(d_im, d_im, a_re, b_im);
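+
+/*
+ * Reference-only scalar sketch of the complex arithmetic that the
+ * FPC_MUL_BOTJ* macros vectorize: d = a * conj(b).  The function name is
+ * illustrative and not part of this header; kept under #if 0 so it is
+ * never compiled.
+ */
+#if 0
+static inline void fpc_mul_conj_ref(double *d_re, double *d_im,
+                                    double a_re, double a_im,
+                                    double b_re, double b_im) {
+    *d_re = a_re * b_re + a_im * b_im; /* Re(a * conj(b)) */
+    *d_im = a_im * b_re - a_re * b_im; /* Im(a * conj(b)) */
+}
+#endif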
+
+#define INV_TOPJ(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta);
+
+/*
+ * FPC_Jm
+ * a * -conj(b)
+ * d_re = a_re * b_im - a_im * b_re;
+ * d_im = a_im * b_im + a_re * b_re;
+ */
+#define FPC_MUL_BOTJm_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 1); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 0); \
+ vfmul_lane(d_im, a_re, b_re_im, 0); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 1);
+
+#define FPC_MUL_BOTJm_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 1); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 1); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 1); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 1); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJm(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_im); \
+ vfmls(d_re, d_re, a_im, b_re); \
+ vfmul(d_im, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_re);
+
+#define INV_TOPJm(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJmx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta);
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrous.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrous.h
new file mode 100644
index 000000000..dfee8bc12
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrous.h
@@ -0,0 +1,469 @@
+/*
+ * Macros for signed/unsigned integer arithmetic
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+#define vmull_lo(c, a, b) c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
+
+#define vmull_hi(c, a, b) c = vmull_high_s16(a, b);
+
+#define vmulla_lo(d, c, a, b) d = vmlal_s16(c, vget_low_s16(a), vget_low_s16(b));
+
+#define vmulla_hi(d, c, a, b) d = vmlal_high_s16(c, a, b);
+
+#define vadd(c, a, b) c = vaddq_u32(a, b);
+
+#define vaddv(c, a) c = vaddvq_u32(a);
+
+#define vor(c, a, b) c = vorrq_u32(a, b);
+
+// Macros for NTT operations, using signed 16-bit arithmetic.
+#define vload_s16_4(c, addr) c = vld4q_s16(addr);
+#define vload_s16_x2(c, addr) c = vld1q_s16_x2(addr);
+#define vload_s16_x4(c, addr) c = vld1q_s16_x4(addr);
+
+#define vstore_s16_x4(addr, c) vst1q_s16_x4(addr, c);
+#define vstore_s16_x2(addr, c) vst1q_s16_x2(addr, c);
+#define vstore_s16_4(add, c) vst4q_s16(add, c);
+
+/*
+ * Strategy for NTT:
+ * - Forward and inverse NTT multiplications by constants use either Barrett or Montgomery *Rounding* arithmetic
+ * - Pointwise multiplication must use Montgomery *Doubling* arithmetic
+ *
+ * Rounding because:
+ *
+ * - Montgomery needs one coefficient to be *odd*, so it only works with precomputed coefficients
+ * => Tried this approach; very strict on the coefficient input range.
+ * => E.g. a*b: a in [-R/2, R/2], b in [-Q/2, Q/2], then c in [-2Q, 2Q]
+ *
+ * - Barrett multiplication seems to work better, with no such restriction
+ * => Proved to be good. E.g. c = a*b, a in [-R, R], b in [-Q/2, Q/2], then c in [-3Q/2, 3Q/2]
+ * However, the output bound varies with the input bound. Using this knowledge, we can further
+ * optimize the Barrett points by carefully checking the output bound implied by each input bound.
+ *
+ * - Barrett reduction with c = a % Q: a in [-R, R], then c in [-Q/2, Q/2]
+ *
+ *
+ * Doubling because:
+ * - Montgomery Doubling works with two unknown coefficients, no constraint at all
+ * => c = a*b. a, b in [-R, R], then c in [-Q, Q]
+ */
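+
+/*
+ * Reference-only scalar sketch of the Barrett *Rounding* multiplication that
+ * the butterfly macros below vectorize.  Q = 12289 is the Falcon modulus; the
+ * name ref_barrett_mul and the w_hi convention (w_hi ~ round(w * 2^15 / Q))
+ * are illustrative -- the real tables fold the doubling of vqrdmulh into the
+ * precomputed constants.  Kept under #if 0 so it is never compiled.
+ */
+#if 0
+static inline int16_t ref_barrett_mul(int16_t a, int16_t w, int16_t w_hi) {
+    int16_t hi = (int16_t)(((int32_t)a * w_hi + (1 << 14)) >> 15); /* ~ round(a*w / Q) */
+    /* a*w - hi*Q lies in [-3Q/2, 3Q/2], so low 16-bit arithmetic recovers it exactly */
+    return (int16_t)((int16_t)(a * w) - (int16_t)(hi * 12289));
+}
+#endif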
+
+// ------------ Forward NTT and Inverse NTT ------------
+/*
+ * GS Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define gsbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
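+
+/*
+ * For reference: what gsbf_br computes per lane, in scalar form, using the
+ * ref_barrett_mul sketch above (illustrative only, never compiled).
+ */
+#if 0
+static inline void ref_gs_butterfly(int16_t *a, int16_t *b, int16_t w, int16_t w_hi) {
+    int16_t t = (int16_t)(*a - *b);
+    *a = (int16_t)(*a + *b);
+    *b = ref_barrett_mul(t, w, w_hi); /* (a - b) * w mod Q */
+}
+#endif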
+
+#define gsbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]); \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top_x4(a, b, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
+#define gsbf_bri_bot_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top(a, b, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b);
+
+#define gsbf_bri_bot(b, zl, zh, i, QMVQ, t) \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_br_bot(b, zl, zh, QMVQ, t) \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+/*
+ * Barrett multiplication via *Rounding*, used for the inverse NTT
+ * Input: a, b, zl, zh, Q. a in [-R, R]
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define barmul_invntt(a, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(a, zh, i); \
+ a = vmulq_laneq_s16(a, zl, i); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
+
+#define barmul_invntt_x2(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0);
+
+#define barmul_invntt_x4(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], zh, i); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], zl, i); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], zl, i); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+/*
+ * Convert coefficients to Montgomery domain
+ */
+#define barmuli_mont(a, QMVM, t) \
+ t = vqrdmulhq_laneq_s16(a, QMVM, 6); \
+ a = vmulq_laneq_s16(a, QMVM, 2); \
+ a = vmlsq_laneq_s16(a, t, QMVM, 0);
+
+#define barmuli_mont_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 6); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 6); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 6); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 6); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 6); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 6); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 6); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 6); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], QMVM, 2); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], QMVM, 2); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], QMVM, 2); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], QMVM, 2); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 2); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 2); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 2); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 2); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * Convert coefficients to the Montgomery domain and embed n^-1
+ */
+
+#define barmuli_mont_ninv_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 7); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 7); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 7); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 7); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 7); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 7); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 7); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 7); \
+ a.val[0] = vshlq_n_s16(a.val[0], FALCON_LOG2_NINV_MONT); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vshlq_n_s16(a.val[1], FALCON_LOG2_NINV_MONT); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vshlq_n_s16(a.val[2], FALCON_LOG2_NINV_MONT); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vshlq_n_s16(a.val[3], FALCON_LOG2_NINV_MONT); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vshlq_n_s16(b.val[0], FALCON_LOG2_NINV_MONT); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vshlq_n_s16(b.val[1], FALCON_LOG2_NINV_MONT); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vshlq_n_s16(b.val[2], FALCON_LOG2_NINV_MONT); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vshlq_n_s16(b.val[3], FALCON_LOG2_NINV_MONT); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * CT Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define ctbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
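+
+/*
+ * For reference: what ctbf_br computes per lane, in scalar form, using the
+ * ref_barrett_mul sketch above (illustrative only, never compiled).
+ */
+#if 0
+static inline void ref_ct_butterfly(int16_t *a, int16_t *b, int16_t w, int16_t w_hi) {
+    int16_t t = ref_barrett_mul(*b, w, w_hi); /* b * w mod Q */
+    *b = (int16_t)(*a - t);
+    *a = (int16_t)(*a + t);
+}
+#endif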
+
+#define ctbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_br_top(b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bri_top(b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bot(a, b, t) \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_bri_top_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0);
+
+#define ctbf_bot_x4(a, b, t) \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+#define ctbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0); \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+// ------------ Pointwise Multiplication ------------
+/*
+ * Montgomery multiplication via *Doubling*
+ * Input: a, b, bNinv, Q
+ * Output: c = ab * R^-1
+ */
+#define montmul(c, a, b, QMVM, t) \
+ c = vqdmulhq_s16(a, b); \
+ t = vmulq_laneq_s16(b, QMVM, 1); \
+ t = vmulq_s16(a, t); \
+ t = vqdmulhq_laneq_s16(t, QMVM, 0); \
+ c = vhsubq_s16(c, t);
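+
+/*
+ * Reference-only scalar sketch of the Montgomery *Doubling* multiplication
+ * above: returns a*b*R^-1 mod Q with R = 2^16, Q = 12289.  REF_QINV is an
+ * assumed constant satisfying (Q * REF_QINV) mod 2^16 == 1; the vector code
+ * keeps the equivalent constants in the QMVM lanes.  Never compiled (#if 0).
+ */
+#if 0
+#define REF_QINV 53249
+static inline int16_t ref_montmul(int16_t a, int16_t b) {
+    int32_t p  = (int32_t)a * b;                        /* full 32-bit product        */
+    int16_t lo = (int16_t)((int16_t)p * REF_QINV);      /* p * Q^-1 mod 2^16          */
+    return (int16_t)((p - (int32_t)lo * 12289) >> 16);  /* a*b*R^-1 (mod Q), |.| < Q  */
+}
+#endif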
+
+#define montmul_x4(z, a, b, QMVM, t) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]);
+
+#define montmul_x8(z, w, a, b, e, f, QMVM, t, k) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ w.val[0] = vqdmulhq_s16(e.val[0], f.val[0]); \
+ w.val[1] = vqdmulhq_s16(e.val[1], f.val[1]); \
+ w.val[2] = vqdmulhq_s16(e.val[2], f.val[2]); \
+ w.val[3] = vqdmulhq_s16(e.val[3], f.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ k.val[0] = vmulq_laneq_s16(f.val[0], QMVM, 1); \
+ k.val[1] = vmulq_laneq_s16(f.val[1], QMVM, 1); \
+ k.val[2] = vmulq_laneq_s16(f.val[2], QMVM, 1); \
+ k.val[3] = vmulq_laneq_s16(f.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ k.val[0] = vmulq_s16(e.val[0], k.val[0]); \
+ k.val[1] = vmulq_s16(e.val[1], k.val[1]); \
+ k.val[2] = vmulq_s16(e.val[2], k.val[2]); \
+ k.val[3] = vmulq_s16(e.val[3], k.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]); \
+ k.val[0] = vqdmulhq_laneq_s16(k.val[0], QMVM, 0); \
+ w.val[0] = vhsubq_s16(w.val[0], k.val[0]); \
+ k.val[1] = vqdmulhq_laneq_s16(k.val[1], QMVM, 0); \
+ w.val[1] = vhsubq_s16(w.val[1], k.val[1]); \
+ k.val[2] = vqdmulhq_laneq_s16(k.val[2], QMVM, 0); \
+ w.val[2] = vhsubq_s16(w.val[2], k.val[2]); \
+ k.val[3] = vqdmulhq_laneq_s16(k.val[3], QMVM, 0); \
+ w.val[3] = vhsubq_s16(w.val[3], k.val[3]);
+
+// ------------ Barrett Reduction ------------
+/*
+ * Barrett reduction, return [-Q/2, Q/2]
+ * `v` = 5461, `n` = 11
+ */
+#define barrett(a, QMVQ, t) \
+ t = vqdmulhq_laneq_s16(a, QMVQ, 4); \
+ t = vrshrq_n_s16(t, 11); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
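+
+/*
+ * Reference-only scalar sketch of the Barrett reduction above
+ * (v = 5461 ~ round(2^26 / Q), n = 11, Q = 12289); rounding details differ
+ * slightly from the vector code.  Never compiled (#if 0).
+ */
+#if 0
+static inline int16_t ref_barrett_reduce(int16_t a) {
+    int16_t t = (int16_t)(((int32_t)a * 5461 + (1 << 25)) >> 26); /* ~ round(a / Q) */
+    return (int16_t)(a - t * 12289);                              /* in [-Q/2, Q/2] */
+}
+#endif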
+
+#define barrett_x2(a, i, j, m, n, QMVQ, t) \
+ t.val[m] = vqdmulhq_laneq_s16(a.val[i], QMVQ, 4); \
+ t.val[m] = vrshrq_n_s16(t.val[m], 11); \
+ t.val[n] = vqdmulhq_laneq_s16(a.val[j], QMVQ, 4); \
+ t.val[n] = vrshrq_n_s16(t.val[n], 11); \
+ a.val[i] = vmlsq_laneq_s16(a.val[i], t.val[m], QMVQ, 0); \
+ a.val[j] = vmlsq_laneq_s16(a.val[j], t.val[n], QMVQ, 0);
+
+#define barrett_x4(a, QMVQ, t) \
+ t.val[0] = vqdmulhq_laneq_s16(a.val[0], QMVQ, 4); \
+ t.val[0] = vrshrq_n_s16(t.val[0], 11); \
+ t.val[1] = vqdmulhq_laneq_s16(a.val[1], QMVQ, 4); \
+ t.val[1] = vrshrq_n_s16(t.val[1], 11); \
+ t.val[2] = vqdmulhq_laneq_s16(a.val[2], QMVQ, 4); \
+ t.val[2] = vrshrq_n_s16(t.val[2], 11); \
+ t.val[3] = vqdmulhq_laneq_s16(a.val[3], QMVQ, 4); \
+ t.val[3] = vrshrq_n_s16(t.val[3], 11); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+// ------------ Matrix Transpose ------------
+/*
+ * Matrix 4x4 transpose: v
+ * Input: int16x8x4_t v, tmp
+ * Output: int16x8x4_t v
+ */
+#define transpose(v, tmp) \
+ tmp.val[0] = vtrn1q_s16(v.val[0], v.val[1]); \
+ tmp.val[1] = vtrn2q_s16(v.val[0], v.val[1]); \
+ tmp.val[2] = vtrn1q_s16(v.val[2], v.val[3]); \
+ tmp.val[3] = vtrn2q_s16(v.val[2], v.val[3]); \
+ v.val[0] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[2] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[1] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]); \
+ v.val[3] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]);
+
+// ------------ Re-arrange vector ------------
+#define arrange(v_out, v_in, i, j, m, n, a, b, c, d) \
+ v_out.val[a] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[b] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[c] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]); \
+ v_out.val[d] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]);
+
+// ------------ Addition/Subtraction ------------
+#define vsub_x4(c, a, b) \
+ c.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+#define vadd_x4(c, a, b) \
+ c.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt.c
new file mode 100644
index 000000000..7007cf245
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt.c
@@ -0,0 +1,928 @@
+/*
+ * High-speed vectorized NTT for N = 512, 1024
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrous.h"
+#include "ntt_consts.h"
+#include "poly.h"
+
+/*
+ * Assume input is in the range [-Q/2, Q/2]
+ * Total Barrett points for N = 512, 1024: 2048, 4096
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont) {
+ // Total SIMD registers 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_ntt_br = PQCLEAN_FALCONPADDED1024_AARCH64_ntt_br;
+ const int16_t *ptr_ntt_qinv_br = PQCLEAN_FALCONPADDED1024_AARCH64_ntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+ zl.val[0] = vld1q_s16(ptr_ntt_br);
+ zh.val[0] = vld1q_s16(ptr_ntt_qinv_br);
+ ptr_ntt_br += 8;
+ ptr_ntt_qinv_br += 8;
+
+ // Layer 9, 8, 7
+ int16x8x2_t u0, u1, u2, u3, u4, u5, u6, u7;
+
+ for (unsigned j = 0; j < 128; j += 16) {
+ vload_s16_x2(u0, &a[j]);
+ vload_s16_x2(u1, &a[j + 128]);
+ vload_s16_x2(u2, &a[j + 256]);
+ vload_s16_x2(u3, &a[j + 384]);
+
+ vload_s16_x2(u4, &a[j + 512]);
+ vload_s16_x2(u5, &a[j + 640]);
+ vload_s16_x2(u6, &a[j + 768]);
+ vload_s16_x2(u7, &a[j + 896]);
+
+ // u0, 4: .5
+ // u1, 5: .5
+ // u2, 6: .5
+ // u3, 7: .5
+
+ // Layer 9
+ // u0 - u4, u1 - u5
+ // u2 - u6, u3 - u7
+ ctbf_bri_top(u4.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[0]);
+ ctbf_bri_top(u4.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[1]);
+ ctbf_bri_top(u5.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[2]);
+ ctbf_bri_top(u5.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(u6.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(u6.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(u7.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(u7.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(u0.val[0], u4.val[0], t.val[0]);
+ ctbf_bot(u0.val[1], u4.val[1], t.val[1]);
+ ctbf_bot(u1.val[0], u5.val[0], t.val[2]);
+ ctbf_bot(u1.val[1], u5.val[1], t.val[3]);
+
+ ctbf_bot(u2.val[0], u6.val[0], t2.val[0]);
+ ctbf_bot(u2.val[1], u6.val[1], t2.val[1]);
+ ctbf_bot(u3.val[0], u7.val[0], t2.val[2]);
+ ctbf_bot(u3.val[1], u7.val[1], t2.val[3]);
+
+ // u0, 4: 1.2
+ // u1, 5: 1.2
+ // u2, 6: 1.2
+ // u3, 7: 1.2
+
+ // Layer 8
+ // u0 - u2, u1 - u3
+ // u4 - u6, u5 - u7
+ ctbf_bri_top(u2.val[0], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[0]);
+ ctbf_bri_top(u2.val[1], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[1]);
+ ctbf_bri_top(u3.val[0], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[2]);
+ ctbf_bri_top(u3.val[1], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(u6.val[0], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(u6.val[1], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(u7.val[0], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(u7.val[1], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(u0.val[0], u2.val[0], t.val[0]);
+ ctbf_bot(u0.val[1], u2.val[1], t.val[1]);
+ ctbf_bot(u1.val[0], u3.val[0], t.val[2]);
+ ctbf_bot(u1.val[1], u3.val[1], t.val[3]);
+
+ ctbf_bot(u4.val[0], u6.val[0], t2.val[0]);
+ ctbf_bot(u4.val[1], u6.val[1], t2.val[1]);
+ ctbf_bot(u5.val[0], u7.val[0], t2.val[2]);
+ ctbf_bot(u5.val[1], u7.val[1], t2.val[3]);
+
+ // 2.14 -> 0.5
+ barrett_x2(u0, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u1, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u2, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u3, 0, 1, 2, 3, neon_qmvq, t);
+
+ barrett_x2(u4, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u5, 0, 1, 2, 3, neon_qmvq, t2);
+ barrett_x2(u6, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u7, 0, 1, 2, 3, neon_qmvq, t2);
+ // u0, 4: .5
+ // u1, 5: .5
+ // u2, 6: .5
+ // u3, 7: .5
+
+ // Layer 7
+ // u0 - u1, u2 - u3
+ // u4 - u5, u6 - u7
+ ctbf_bri_top(u1.val[0], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[0]);
+ ctbf_bri_top(u1.val[1], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[1]);
+ ctbf_bri_top(u3.val[0], zl.val[0], zh.val[0], 5, neon_qmvq, t.val[2]);
+ ctbf_bri_top(u3.val[1], zl.val[0], zh.val[0], 5, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(u5.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(u5.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(u7.val[0], zl.val[0], zh.val[0], 7, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(u7.val[1], zl.val[0], zh.val[0], 7, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(u0.val[0], u1.val[0], t.val[0]);
+ ctbf_bot(u0.val[1], u1.val[1], t.val[1]);
+ ctbf_bot(u2.val[0], u3.val[0], t.val[2]);
+ ctbf_bot(u2.val[1], u3.val[1], t.val[3]);
+
+ ctbf_bot(u4.val[0], u5.val[0], t2.val[0]);
+ ctbf_bot(u4.val[1], u5.val[1], t2.val[1]);
+ ctbf_bot(u6.val[0], u7.val[0], t2.val[2]);
+ ctbf_bot(u6.val[1], u7.val[1], t2.val[3]);
+
+ // u0, 4: 1.2
+ // u1, 5: 1.2
+ // u2, 6: 1.2
+ // u3, 7: 1.2
+
+ // Store at 1.2Q
+ vstore_s16_x2(&a[j], u0);
+ vstore_s16_x2(&a[j + 128], u1);
+ vstore_s16_x2(&a[j + 256], u2);
+ vstore_s16_x2(&a[j + 384], u3);
+
+ vstore_s16_x2(&a[j + 512], u4);
+ vstore_s16_x2(&a[j + 640], u5);
+ vstore_s16_x2(&a[j + 768], u6);
+ vstore_s16_x2(&a[j + 896], u7);
+ }
+
+ // Layer 6, 5, 4, 3, 2, 1, 0
+ for (unsigned j = 0; j < FALCON_N; j += 128) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 32]);
+ vload_s16_x4(v2, &a[j + 64]);
+ vload_s16_x4(v3, &a[j + 96]);
+
+ vload_s16_x2(zlh, ptr_ntt_br);
+ vload_s16_x2(zhh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 16;
+ ptr_ntt_qinv_br += 16;
+
+ // Layer 6
+ // v0 - v2, v1 - v3
+ ctbf_bri_top_x4(v2, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v2, t);
+ ctbf_bot_x4(v1, v3, t2);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t);
+ barrett_x4(v3, neon_qmvq, t);
+
+ // Layer 5
+ // v0 - v1, v2 - v3
+ ctbf_bri_top_x4(v1, zlh.val[0], zhh.val[0], 1, 1, 1, 1, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v1, t);
+ ctbf_bot_x4(v2, v3, t2);
+
+ // 1.3
+
+ // Layer 4
+ // v0(0, 1 - 2, 3)
+ // v1(0, 1 - 2, 3)
+ // v2(0, 1 - 2, 3)
+ // v3(0, 1 - 2, 3)
+ ctbf_bri_top(v0.val[2], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[2], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[2], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[2], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t2.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t2.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t2.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 3
+ // v0(0, 2 - 1, 3)
+ // v1(0, 2 - 1, 3)
+ // v2(0, 2 - 1, 3)
+ // v3(0, 2 - 1, 3)
+ ctbf_bri_top(v0.val[1], zlh.val[0], zhh.val[0], 7, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[1], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[1], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[1], zhh.val[1], 4, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[1], zlh.val[1], zhh.val[1], 5, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[1], zhh.val[1], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v0.val[2], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[2]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[1], t2.val[0]);
+ ctbf_bot(v2.val[2], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[1], t2.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t2.val[3]);
+
+ // 1.3
+
+ // Layer 2
+ // Input:
+ // 0, 1, 2, 3 | 4, 5, 6, 7
+ // 8, 9, 10, 11 | 12, 13, 14, 15
+ // 16, 17, 18, 19 | 20, 21, 22, 23
+ // 24, 25, 26, 27 | 28, 29, 30, 31
+ arrange(t, v0, 0, 2, 1, 3, 0, 1, 2, 3);
+ v0 = t;
+ arrange(t, v1, 0, 2, 1, 3, 0, 1, 2, 3);
+ v1 = t;
+ arrange(t2, v2, 0, 2, 1, 3, 0, 1, 2, 3);
+ v2 = t2;
+ arrange(t2, v3, 0, 2, 1, 3, 0, 1, 2, 3);
+ v3 = t2;
+ // Output:
+ // 0, 1, 2, 3 | 16, 17, 18, 19
+ // 4, 5, 6, 7 | 20, 21, 22, 23
+ // 8, 9, 10, 11 | 24, 25, 26, 27
+ // 12, 13, 14, 15 | 28, 29, 30, 31
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 1: v0.val[0] x v0.val[2] | v0.val[1] x v0.val[3]
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // transpose 4x4
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[2], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[1]);
+ ctbf_br_top(v1.val[2], zl.val[1], zh.val[1], neon_qmvq, t.val[2]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_br_top(v2.val[2], zl.val[2], zh.val[2], neon_qmvq, t.val[0]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[1]);
+ ctbf_br_top(v3.val[2], zl.val[3], zh.val[3], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t.val[3]);
+
+ // 1.3
+
+ // Layer 0
+ // v(0, 2 - 1, 3)
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 2.3
+
+ if (mont == NTT_MONT) {
+ // Convert to the Montgomery domain by multiplying by FALCON_MONT
+ barmuli_mont_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_x8(v2, v3, neon_qmvq, t, t2);
+ } else if (mont == NTT_MONT_INV) {
+ barmuli_mont_ninv_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_ninv_x8(v2, v3, neon_qmvq, t, t2);
+ }
+
+ vstore_s16_4(&a[j], v0);
+ vstore_s16_4(&a[j + 32], v1);
+ vstore_s16_4(&a[j + 64], v2);
+ vstore_s16_4(&a[j + 96], v3);
+ }
+}
+
+/*
+ * Assume input is in the range [-Q, Q]
+ * Total Barrett points for N = 512, 1024: 1792, 3840
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv) {
+ // Total SIMD registers: 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_invntt_br = PQCLEAN_FALCONPADDED1024_AARCH64_invntt_br;
+ const int16_t *ptr_invntt_qinv_br = PQCLEAN_FALCONPADDED1024_AARCH64_invntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+ unsigned j;
+
+ // Layer 0, 1, 2, 3, 4, 5, 6
+ for (j = 0; j < FALCON_N; j += 128) {
+ vload_s16_4(v0, &a[j]);
+ vload_s16_4(v1, &a[j + 32]);
+ vload_s16_4(v2, &a[j + 64]);
+ vload_s16_4(v3, &a[j + 96]);
+
+ // Layer 0
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ // 0 - 1*, 2 - 3*
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 1
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ // 0 - 2*, 1 - 3*
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v0.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t.val[1]);
+ gsbf_br_bot(v1.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v1.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v2.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v3.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.6
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 2
+ // Before Transpose
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+
+ // After Transpose
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // 0 - 1*, 2 - 3*
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 1
+ // 1: 0.9
+ // 2: 1
+ // 3: 0.9
+
+ // Layer 3
+ // Re-arrange vector from
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // Compiler will handle register re-naming
+ arrange(t, v0, 0, 1, 2, 3, 0, 2, 1, 3);
+ v0 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t, v1, 0, 1, 2, 3, 0, 2, 1, 3);
+ v1 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v2, 0, 1, 2, 3, 0, 2, 1, 3);
+ v2 = t2;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v3, 0, 1, 2, 3, 0, 2, 1, 3);
+ v3 = t2;
+ // To
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 1, 2 - 3
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v0.val[2], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[2]);
+ gsbf_top(v1.val[2], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[1], t2.val[0]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[1], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[1], zlh.val[0], zhh.val[0], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[0], zhh.val[0], 1, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[1], zlh.val[0], zhh.val[0], 2, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[1], zlh.val[0], zhh.val[0], 4, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[1], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[0], zhh.val[0], 7, neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 4
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 2, 1 - 3
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[2], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[2], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[2], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[2], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.5
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 5
+ // Cross block
+ // v0.0->3 - v1.0->3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zlh.val[1], zhh.val[1], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 5, 5, 5, 5, neon_qmvq, t2);
+
+ // v0: 1
+ // v1: 0.9
+ // v2: 1
+ // v3: 0.9
+
+ // Layer 6
+ // Cross block
+ // v0.0->3 - v2.0->3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ gsbf_bri_bot_x4(v2, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t2);
+
+ // v0: 2
+ // v1: 1.8
+ // v2: 1.3
+ // v3: 1.2
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 32], v1);
+ vstore_s16_x4(&a[j + 64], v2);
+ vstore_s16_x4(&a[j + 96], v3);
+ }
+
+ ptr_invntt_br += 8 * ninv;
+ ptr_invntt_qinv_br += 8 * ninv;
+ zl.val[0] = vld1q_s16(ptr_invntt_br);
+ zh.val[0] = vld1q_s16(ptr_invntt_qinv_br);
+
+ // Layer 7, 8, 9
+ int16x8x2_t u0, u1, u2, u3, u4, u5, u6, u7;
+
+ for (j = 0; j < 128; j += 16) {
+ vload_s16_x2(u0, &a[j]);
+ vload_s16_x2(u1, &a[j + 128]);
+ vload_s16_x2(u2, &a[j + 256]);
+ vload_s16_x2(u3, &a[j + 384]);
+
+ vload_s16_x2(u4, &a[j + 512]);
+ vload_s16_x2(u5, &a[j + 640]);
+ vload_s16_x2(u6, &a[j + 768]);
+ vload_s16_x2(u7, &a[j + 896]);
+
+ // 2
+ barrett_x2(u0, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u1, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u2, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u3, 0, 1, 2, 3, neon_qmvq, t);
+
+ barrett_x2(u4, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u5, 0, 1, 2, 3, neon_qmvq, t2);
+ barrett_x2(u6, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u7, 0, 1, 2, 3, neon_qmvq, t2);
+
+ // u0, 4: 0.5
+ // u1, 5: 0.5
+ // u2, 6: 0.5
+ // u3, 7: 0.5
+
+ // Layer 7
+ // u0 - u1, u2 - u3
+ // u4 - u5, u6 - u7
+ gsbf_top(u0.val[0], u1.val[0], t.val[0]);
+ gsbf_top(u0.val[1], u1.val[1], t.val[1]);
+ gsbf_top(u2.val[0], u3.val[0], t.val[2]);
+ gsbf_top(u2.val[1], u3.val[1], t.val[3]);
+
+ gsbf_top(u4.val[0], u5.val[0], t2.val[0]);
+ gsbf_top(u4.val[1], u5.val[1], t2.val[1]);
+ gsbf_top(u6.val[0], u7.val[0], t2.val[2]);
+ gsbf_top(u6.val[1], u7.val[1], t2.val[3]);
+
+ gsbf_bri_bot(u1.val[0], zl.val[0], zh.val[0], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(u1.val[1], zl.val[0], zh.val[0], 0, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(u3.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(u3.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(u5.val[0], zl.val[0], zh.val[0], 2, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(u5.val[1], zl.val[0], zh.val[0], 2, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(u7.val[0], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(u7.val[1], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[3]);
+
+ // u0, 4: 1
+ // u1, 5: .87
+ // u2, 6: 1
+ // u3, 7: .87
+
+ // Layer 8
+ // u0 - u2, u1 - u3
+ // u4 - u6, u5 - u7
+ gsbf_top(u0.val[0], u2.val[0], t.val[0]);
+ gsbf_top(u0.val[1], u2.val[1], t.val[1]);
+ gsbf_top(u1.val[0], u3.val[0], t.val[2]);
+ gsbf_top(u1.val[1], u3.val[1], t.val[3]);
+
+ gsbf_top(u4.val[0], u6.val[0], t2.val[0]);
+ gsbf_top(u4.val[1], u6.val[1], t2.val[1]);
+ gsbf_top(u5.val[0], u7.val[0], t2.val[2]);
+ gsbf_top(u5.val[1], u7.val[1], t2.val[3]);
+
+ gsbf_bri_bot(u2.val[0], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(u2.val[1], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(u3.val[0], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(u3.val[1], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(u6.val[0], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(u6.val[1], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(u7.val[0], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(u7.val[1], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[3]);
+
+ // u0, 4: 2
+ // u2, 6: 1.25
+ // u1, 5: 1.75
+ // u3, 7: 1.15
+
+ barrett_x2(u0, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u4, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u1, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u5, 0, 1, 2, 3, neon_qmvq, t2);
+
+ // u0, 4: 0.5
+ // u2, 6: 1.25
+ // u1, 5: 0.5
+ // u3, 7: 1.15
+
+ // Layer 9
+ // u0 - u4, u1 - u5
+ // u2 - u6, u3 - u7
+ gsbf_top(u0.val[0], u4.val[0], t.val[0]);
+ gsbf_top(u0.val[1], u4.val[1], t.val[1]);
+ gsbf_top(u1.val[0], u5.val[0], t.val[2]);
+ gsbf_top(u1.val[1], u5.val[1], t.val[3]);
+
+ gsbf_top(u2.val[0], u6.val[0], t2.val[0]);
+ gsbf_top(u2.val[1], u6.val[1], t2.val[1]);
+ gsbf_top(u3.val[0], u7.val[0], t2.val[2]);
+ gsbf_top(u3.val[1], u7.val[1], t2.val[3]);
+
+ gsbf_bri_bot(u4.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(u4.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(u5.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(u5.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(u6.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(u6.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(u7.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(u7.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[3]);
+
+ // u0, 4: 1, .87
+ // u2, 6: 2.5, 1.5
+ // u1, 5: 1, .87
+ // u3, 7: 2.3, 1.4
+
+ if (ninv == INVNTT_NINV) {
+ barmul_invntt_x2(u0, zl.val[0], zh.val[0], 7, neon_qmvq, t);
+ barmul_invntt_x2(u1, zl.val[0], zh.val[0], 7, neon_qmvq, t);
+ barmul_invntt_x2(u2, zl.val[0], zh.val[0], 7, neon_qmvq, t2);
+ barmul_invntt_x2(u3, zl.val[0], zh.val[0], 7, neon_qmvq, t2);
+ }
+
+ // u0, 4: .87, .87
+ // u2, 6: 1.5, 1.5
+ // u1, 5: .87, .87
+ // u3, 7: 1.4, 1.4
+
+ barrett_x2(u2, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u6, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u3, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u7, 0, 1, 2, 3, neon_qmvq, t2);
+
+ // u0, 4: .87, .87
+ // u2, 6: .5, .5
+ // u1, 5: .87, .87
+ // u3, 7: .5, .5
+
+ vstore_s16_x2(&a[j], u0);
+ vstore_s16_x2(&a[j + 128], u1);
+ vstore_s16_x2(&a[j + 256], u2);
+ vstore_s16_x2(&a[j + 384], u3);
+
+ vstore_s16_x2(&a[j + 512], u4);
+ vstore_s16_x2(&a[j + 640], u5);
+ vstore_s16_x2(&a[j + 768], u6);
+ vstore_s16_x2(&a[j + 896], u7);
+ }
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
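+ // Pointwise (coefficient-wise) product f <- f * g of two polynomials in the NTT
+ // domain; montmul_x8 performs the Montgomery multiplications, 64 coefficients
+ // per loop iteration.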
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e1, e2, t, k; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+
+ for (unsigned i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+ vload_s16_x4(b, &g[i]);
+ vload_s16_x4(c, &f[i + 32]);
+ vload_s16_x4(d, &g[i + 32]);
+
+ montmul_x8(e1, e2, a, b, c, d, neon_qmvm, t, k);
+
+ vstore_s16_x4(&f[i], e1);
+ vstore_s16_x4(&f[i + 32], e2);
+ }
+}
+
+/* ===================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.c
new file mode 100644
index 000000000..f6dbf1178
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.c
@@ -0,0 +1,732 @@
+#include "ntt_consts.h"
+#include "params.h"
+
+#define PADDING 0
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_qmvq[8] = {FALCON_Q, FALCON_QINV,
+ FALCON_MONT, FALCON_NINV_MONT,
+ FALCON_V, 0,
+ FALCON_MONT_BR, FALCON_NINV_MONT_BR
+ };
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_br[] = {
+ PADDING, -1479, -5146, 4043, -1305, 722, 5736, -4134,
+ 3542, -4821, 2639, 2319, -1170, -955, -790, 1260,
+ 4388, 4632, -5755, 2426, 334, 1428, 1696, PADDING,
+ 2401, 2401, 2401, 2401, -5101, -5101, -5101, -5101,
+ 390, 390, 390, 390, -3833, -3833, -3833, -3833,
+ 354, 354, 354, 354, -2912, -2912, -2912, -2912,
+ 5012, 5012, 5012, 5012, 2859, 2859, 2859, 2859,
+ 442, 442, 442, 442, -1067, -1067, -1067, -1067,
+ 773, 773, 773, 773, 3778, 3778, 3778, 3778,
+ 4861, 4861, 4861, 4861, 5698, 5698, 5698, 5698,
+ -2481, -2481, -2481, -2481, -1045, -1045, -1045, -1045,
+ 49, 1263, 5915, 1483, -2500, -1489, -1583, -5942,
+ 1512, 350, -1815, 5383, 5369, -2057, -3202, 4493,
+ -2738, -5868, -5735, 2655, -3009, 1693, 174, 723,
+ -1975, -3757, 347, 2925, -3315, -426, 1858, 4754,
+ 7, 845, 3154, 3285, 216, -5526, 767, -2213,
+ 3120, -6086, -3941, 3536, 3229, -1706, 1282, 2021,
+ 3944, 5604, 2171, -1265, -2945, 2633, -3232, 4855,
+ -2941, -5662, 3837, 3221, 4050, 844, -980, 4590,
+ 1936, 3723, 5054, -4360, 50, 769, -3805, 4153,
+ -6105, 5646, 3753, 5370, 4730, 3929, -3572, -2832,
+ 4099, -5530, -3480, 3007, 5349, 1406, -293, -3769,
+ -567, 5289, 2595, 4273, -5207, 5202, -682, -5082,
+ -3504, -2625, -949, -3201, 3014, 5086, -1326, 2013,
+ -3289, 729, 3241, 2881, 3284, -5092, -2089, PADDING,
+ 1017, 1017, 1017, 1017, 1632, 1632, 1632, 1632,
+ 27, 27, 27, 27, -3763, -3763, -3763, -3763,
+ 1537, 1537, 1537, 1537, 4714, 4714, 4714, 4714,
+ -2678, -2678, -2678, -2678, 5019, 5019, 5019, 5019,
+ -4885, -4885, -4885, -4885, -5084, -5084, -5084, -5084,
+ -3066, -3066, -3066, -3066, -1440, -1440, -1440, -1440,
+ 242, 242, 242, 242, -4143, -4143, -4143, -4143,
+ 3704, 3704, 3704, 3704, -545, -545, -545, -545,
+ 3030, 4115, 2361, -1843, 2908, 218, 3434, -3529,
+ 3963, 576, 6142, -2447, 1954, -2051, -2882, -1805,
+ 3991, -3969, -2767, 156, 2281, 5876, -2031, 5333,
+ 3772, 418, 5908, -453, 5429, -4774, -4737, 1293,
+ -3469, -4443, 4693, -2293, 1802, 5103, -4411, 1223,
+ -1280, -24, -904, -5547, 881, 1015, 5461, 2637,
+ 4684, -5135, -4987, 3670, 578, -450, -4661, -2622,
+ 5618, 5789, 5043, 3090, 3065, -5703, -5900, -4719,
+ 6138, -3418, 2338, -417, 1555, -1891, -1590, -2334,
+ 614, -1371, -2485, -5039, -365, -1927, -2946, -4510,
+ 3360, 63, 2373, 3808, 5368, 1944, -510, -5386,
+ -1658, 3502, 826, 1398, 1506, 4483, 910, -751,
+ -2545, -563, -2975, 4846, -2747, -3135, 3712, -3694,
+ -5179, -1759, -3707, 3382, -355, -2548, -4231, PADDING,
+ 1002, 1002, 1002, 1002, 5088, 5088, 5088, 5088,
+ -4976, -4976, -4976, -4976, -3780, -3780, -3780, -3780,
+ -2437, -2437, -2437, -2437, 6022, 6022, 6022, 6022,
+ -2566, -2566, -2566, -2566, -6039, -6039, -6039, -6039,
+ 5011, 5011, 5011, 5011, -4284, -4284, -4284, -4284,
+ -1607, -1607, -1607, -1607, -875, -875, -875, -875,
+ 3646, 3646, 3646, 3646, 2987, 2987, 2987, 2987,
+ -2187, -2187, -2187, -2187, -2422, -2422, -2422, -2422,
+ 295, 6099, 5766, 652, -4016, 4077, -3762, -2919,
+ 325, -1404, -1146, -948, 5990, 1159, -3728, -4049,
+ 3329, 4298, -168, 2692, 5961, -5106, -1962, 1594,
+ -6122, -2555, -5184, -1200, 1360, 3956, -6119, 5297,
+ 4518, 1160, 2730, -2253, 2478, 4194, -1783, -4565,
+ -5170, -865, 189, -1763, -1530, -3869, 5832, -1734,
+ -5275, -1251, 2035, -1882, -4770, 5287, -5673, -5406,
+ 4834, -2828, -4113, 3840, 3451, -1241, -5781, -2643,
+ 3094, 4820, 5411, 1868, -2840, 3019, -5078, 4974,
+ 2672, 1279, 3116, 2209, 1694, -4423, 1350, -3815,
+ -1790, -5410, 1040, -6125, 944, -3669, -3020, -4665,
+ 2712, 4352, 72, -1842, -4094, 4378, -3045, 1095,
+ 3621, -3006, -2744, 4805, -3553, -1062, -2294, 3637,
+ 3459, 145, -5542, -2731, -3932, -4890, -5911, PADDING,
+ -1065, -1065, -1065, -1065, -404, -404, -404, -404,
+ 1168, 1168, 1168, 1168, -1207, -1207, -1207, -1207,
+ 493, 493, 493, 493, -5444, -5444, -5444, -5444,
+ -4337, -4337, -4337, -4337, 1378, 1378, 1378, 1378,
+ 2143, 2143, 2143, 2143, -4645, -4645, -4645, -4645,
+ 5277, 5277, 5277, 5277, 3248, 3248, 3248, 3248,
+ -4096, -4096, -4096, -4096, 2381, 2381, 2381, 2381,
+ -435, -435, -435, -435, 1912, 1912, 1912, 1912,
+ -4079, -1058, 922, 441, 1958, 4322, 1112, 2078,
+ 4046, 709, -3150, 1319, 4240, -3570, -6065, -835,
+ 2459, 683, 3656, -64, -1566, 5782, -2948, -2503,
+ -3123, -1747, -3054, -5486, -4433, -5919, 3834, -5257,
+ 2873, -791, -1120, -21, 874, 170, 2307, -648,
+ -1030, 3821, 4649, 2929, 1573, 3793, -502, 2602,
+ 1849, -3268, -4301, 457, -879, 982, 4218, -3454,
+ -4504, 530, 3578, -3466, -2046, -2957, 3317, 139,
+ 2827, 2434, -2535, -5808, -2301, -5650, 4289, -150,
+ -466, 1681, 5969, 6026, -3846, -6063, 5118, -1901,
+ 5776, 3795, -4523, -8, -2593, -2276, 4390, -3758,
+ 778, 2626, 4697, 1701, 2940, -1481, -2532, 3332,
+ -1646, 5728, -4591, 3091, -81, -4320, -1000, -2842,
+ 480, 1022, 9, -2468, 339, 5791, 544, PADDING,
+ 2166, 2166, 2166, 2166, -113, -113, -113, -113,
+ -160, -160, -160, -160, -3, -3, -3, -3,
+ 3636, 3636, 3636, 3636, 5291, 5291, 5291, 5291,
+ -1426, -1426, -1426, -1426, 1663, 1663, 1663, 1663,
+ 3915, 3915, 3915, 3915, -4919, -4919, -4919, -4919,
+ 3149, 3149, 3149, 3149, 4437, 4437, 4437, 4437,
+ 4938, 4938, 4938, 4938, 2704, 2704, 2704, 2704,
+ -4654, -4654, -4654, -4654, -1777, -1777, -1777, -1777,
+ -5241, -2920, -4169, -3127, -5468, 1010, -3482, 787,
+ 5057, 4698, 4780, -3445, -192, 1321, 4912, -2049,
+ 677, -5874, -6055, -3336, 1323, -2766, -52, 3174,
+ 1579, -431, -2505, 5906, 3957, -2839, 151, -2127,
+ 343, 4538, -5211, 1208, -1705, -416, 716, 2164,
+ 5412, -3278, 3515, 1218, -1536, 2429, 1373, 717,
+ -3368, 4238, -4222, -540, 3163, 6127, 1389, 4404,
+ 3359, 5209, 3678, -1928, 1826, 4489, 1136, 3708,
+ -3448, -1908, 1866, -4727, 2450, 814, -2110, -5416,
+ -4209, -5993, -438, 5061, -1721, -4103, -2982, -3589,
+ 4227, -612, 1526, -125, 4032, -4840, -2068, -346,
+ -3205, 1092, 4265, 464, 2926, -3171, 3449, -3238,
+ 1212, 5023, 5828, -2963, -4896, -3051, 2366, -1673,
+ 4278, -5331, -4989, -4177, -3584, 1381, -2525, PADDING,
+ 3364, 3364, 3364, 3364, 4057, 4057, 4057, 4057,
+ -2847, -2847, -2847, -2847, 2174, 2174, 2174, 2174,
+ -5042, -5042, -5042, -5042, 4053, 4053, 4053, 4053,
+ 5195, 5195, 5195, 5195, -4895, -4895, -4895, -4895,
+ 1689, 1689, 1689, 1689, -3271, -3271, -3271, -3271,
+ -4414, -4414, -4414, -4414, 4372, 4372, 4372, 4372,
+ -2305, -2305, -2305, -2305, 2645, 2645, 2645, 2645,
+ -2780, -2780, -2780, -2780, 1484, 1484, 1484, 1484,
+ -58, -241, 3532, -1003, 1956, -5009, -885, -6008,
+ 3477, -5681, 142, -1105, -2844, 3438, -975, 4212,
+ -3029, -5594, 4782, 5886, -4213, 504, 2302, -605,
+ -421, -4080, 3602, 6068, -3600, 3263, 6077, -4624,
+ 2065, 3495, -3534, -1756, 2275, 4267, 5063, -1518,
+ -1275, -1176, 4860, -1445, -5987, 579, -2769, -5966,
+ -3975, -5835, 1417, -4505, 3744, 2528, 5102, -5588,
+ 4924, 1014, 1327, 3942, 2717, 3200, 5836, 2260,
+ 5826, 4564, 3961, 4145, 2461, 5653, -4176, -3765,
+ 5508, -5734, 1125, -1131, -5596, 3889, 3114, 212,
+ 4883, 3087, 5676, 2257, 4963, -3056, -412, -5845,
+ 4781, -448, 3607, -5232, 60, -1535, -4566, 68,
+ 3195, -3328, -5777, -1177, -4255, -1635, -2768, -953,
+ -3748, 827, 5767, 2476, 118, 2197, -5067, PADDING,
+ -3247, -3247, -3247, -3247, -3978, -3978, -3978, -3978,
+ -2370, -2370, -2370, -2370, 5332, 5332, 5332, 5332,
+ 1630, 1630, 1630, 1630, 5407, 5407, 5407, 5407,
+ -1153, -1153, -1153, -1153, -2249, -2249, -2249, -2249,
+ -2686, -2686, -2686, -2686, -2969, -2969, -2969, -2969,
+ 2865, 2865, 2865, 2865, 3510, 3510, 3510, 3510,
+ -2126, -2126, -2126, -2126, 3186, 3186, 3186, 3186,
+ -2884, -2884, -2884, -2884, -4048, -4048, -4048, -4048,
+ -4467, -4789, -5537, 4749, 4449, -5456, -147, -3789,
+ 6118, -3818, 1190, -2683, 3860, 5445, -4536, -1050,
+ 5079, -3262, 2169, -522, -4324, 4916, -4075, 5315,
+ -1278, -2344, 1973, -5574, -3514, -1041, 5925, -1018,
+ 180, -4605, -1409, 204, -1468, -3407, -1344, -2483,
+ 4739, -5518, -3028, -364, -1236, -5246, 3121, 1057,
+ -406, 146, 1403, 6094, -239, 994, 4670, 5464,
+ 3375, -3393, -4913, 3825, -2947, 636, -622, 5672,
+ 4138, 2689, -5219, 5509, -3981, 463, -3042, -2054,
+ -4251, 1226, 5216, -2360, -3017, 4475, 4705, -2600,
+ -1687, 5268, 1804, -5189, -2900, 4554, -512, 4906,
+ -2291, 4335, 3528, -4235, -3982, 5609, -1737, 4499,
+ 5860, -4978, 1351, -140, -1853, -4611, -726, 3949,
+ -3296, 4452, 2396, -4354, 130, 2837, -5374, PADDING,
+ -2399, -2399, -2399, -2399, -5191, -5191, -5191, -5191,
+ -3000, -3000, -3000, -3000, 3016, 3016, 3016, 3016,
+ -5559, -5559, -5559, -5559, -2178, -2178, -2178, -2178,
+ 3985, 3985, 3985, 3985, 3531, 3531, 3531, 3531,
+ -3400, -3400, -3400, -3400, -3136, -3136, -3136, -3136,
+ 671, 671, 671, 671, 243, 243, 243, 243,
+ 420, 420, 420, 420, 1544, 1544, 1544, 1544,
+ 4905, 4905, 4905, 4905, 476, 476, 476, 476,
+ 654, 3565, 1702, 1987, -5529, 5206, 3199, -56,
+ 6136, -5862, -5415, -3643, 4948, -6137, 400, -1728,
+ 5339, 5446, 3710, 6093, 468, -3988, 316, -382,
+ -2033, -3998, 3879, 1922, -1359, -5435, 973, -1254,
+ 5598, -1892, -5724, -1029, 5959, -3959, 2442, 5115,
+ -1314, 2894, -5690, -3947, 3343, 1522, -20, 4608,
+ 4578, -375, -1836, -2185, 6085, -1038, -2231, 2800,
+ 506, 1392, 3276, 2212, -1942, 2575, 2776, -5478,
+ 3344, -3624, -1325, -1945, -2148, 5797, 1248, 4939,
+ 1744, -3654, -2455, 338, -4119, -2151, 5002, 5163,
+ 377, 1620, -425, -392, -4167, -923, -6092, 193,
+ 1255, 5784, -3338, -2674, -3408, 1165, -1178, 3511,
+}; // 1024->1416
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_qinv_br[] = {
+ PADDING, -3943, -13721, 10780, -3479, 1925, 15294, -11023,
+ 9444, -12854, 7036, 6183, -3119, -2546, -2106, 3359,
+ 11700, 12350, -15345, 6468, 890, 3807, 4522, PADDING,
+ 6402, 6402, 6402, 6402, -13601, -13601, -13601, -13601,
+ 1039, 1039, 1039, 1039, -10220, -10220, -10220, -10220,
+ 943, 943, 943, 943, -7764, -7764, -7764, -7764,
+ 13364, 13364, 13364, 13364, 7623, 7623, 7623, 7623,
+ 1178, 1178, 1178, 1178, -2845, -2845, -2845, -2845,
+ 2061, 2061, 2061, 2061, 10073, 10073, 10073, 10073,
+ 12961, 12961, 12961, 12961, 15193, 15193, 15193, 15193,
+ -6615, -6615, -6615, -6615, -2786, -2786, -2786, -2786,
+ 130, 3367, 15772, 3954, -6666, -3970, -4220, -15844,
+ 4031, 933, -4839, 14353, 14316, -5484, -8537, 11980,
+ -7300, -15646, -15292, 7079, -8023, 4514, 463, 1927,
+ -5266, -10017, 925, 7799, -8839, -1135, 4954, 12676,
+ 18, 2253, 8409, 8759, 575, -14734, 2045, -5900,
+ 8319, -16228, -10508, 9428, 8609, -4548, 3418, 5388,
+ 10516, 14942, 5788, -3373, -7852, 7020, -8617, 12945,
+ -7842, -15097, 10231, 8588, 10799, 2250, -2613, 12239,
+ 5162, 9927, 13476, -11625, 133, 2050, -10145, 11073,
+ -16278, 15054, 10007, 14318, 12612, 10476, -9524, -7551,
+ 10929, -14745, -9279, 8018, 14262, 3749, -781, -10049,
+ -1511, 14102, 6919, 11393, -13884, 13870, -1818, -13550,
+ -9343, -6999, -2530, -8535, 8036, 13561, -3535, 5367,
+ -8769, 1943, 8641, 7682, 8756, -13577, -5570, PADDING,
+ 2711, 2711, 2711, 2711, 4351, 4351, 4351, 4351,
+ 71, 71, 71, 71, -10033, -10033, -10033, -10033,
+ 4098, 4098, 4098, 4098, 12569, 12569, 12569, 12569,
+ -7140, -7140, -7140, -7140, 13382, 13382, 13382, 13382,
+ -13025, -13025, -13025, -13025, -13556, -13556, -13556, -13556,
+ -8175, -8175, -8175, -8175, -3839, -3839, -3839, -3839,
+ 645, 645, 645, 645, -11047, -11047, -11047, -11047,
+ 9876, 9876, 9876, 9876, -1453, -1453, -1453, -1453,
+ 8079, 10972, 6295, -4914, 7754, 581, 9156, -9409,
+ 10567, 1535, 16377, -6524, 5210, -5468, -7684, -4812,
+ 10641, -10583, -7378, 415, 6082, 15668, -5415, 14220,
+ 10057, 1114, 15753, -1207, 14476, -12729, -12630, 3447,
+ -9249, -11847, 12513, -6114, 4804, 13606, -11761, 3261,
+ -3413, -63, -2410, -14790, 2349, 2706, 14561, 7031,
+ 12489, -13692, -13297, 9785, 1541, -1199, -12428, -6991,
+ 14980, 15436, 13446, 8239, 8172, -15206, -15732, -12582,
+ 16366, -9113, 6234, -1111, 4146, -5042, -4239, -6223,
+ 1637, -3655, -6626, -13436, -973, -5138, -7855, -12025,
+ 8959, 167, 6327, 10153, 14313, 5183, -1359, -14361,
+ -4420, 9337, 2202, 3727, 4015, 11953, 2426, -2002,
+ -6786, -1501, -7932, 12921, -7324, -8359, 9897, -9849,
+ -13809, -4690, -9884, 9017, -946, -6794, -11281, PADDING,
+ 2671, 2671, 2671, 2671, 13566, 13566, 13566, 13566,
+ -13268, -13268, -13268, -13268, -10079, -10079, -10079, -10079,
+ -6498, -6498, -6498, -6498, 16057, 16057, 16057, 16057,
+ -6842, -6842, -6842, -6842, -16102, -16102, -16102, -16102,
+ 13361, 13361, 13361, 13361, -11423, -11423, -11423, -11423,
+ -4284, -4284, -4284, -4284, -2333, -2333, -2333, -2333,
+ 9721, 9721, 9721, 9721, 7964, 7964, 7964, 7964,
+ -5831, -5831, -5831, -5831, -6458, -6458, -6458, -6458,
+ 786, 16262, 15374, 1738, -10708, 10871, -10031, -7783,
+ 866, -3743, -3055, -2527, 15972, 3090, -9940, -10796,
+ 8876, 11460, -447, 7178, 15894, -13614, -5231, 4250,
+ -16324, -6812, -13822, -3199, 3626, 10548, -16316, 14124,
+ 12047, 3093, 7279, -6007, 6607, 11183, -4754, -12172,
+ -13785, -2306, 503, -4700, -4079, -10316, 15550, -4623,
+ -14065, -3335, 5426, -5018, -12718, 14097, -15126, -14414,
+ 12889, -7540, -10967, 10239, 9201, -3309, -15414, -7047,
+ 8249, 12852, 14428, 4980, -7572, 8050, -13540, 13262,
+ 7124, 3410, 8308, 5890, 4516, -11793, 3599, -10172,
+ -4772, -14425, 2773, -16332, 2517, -9783, -8052, -12438,
+ 7231, 11604, 191, -4911, -10916, 11673, -8119, 2919,
+ 9655, -8015, -7316, 12812, -9473, -2831, -6116, 9697,
+ 9223, 386, -14777, -7282, -10484, -13038, -15761, PADDING,
+ -2839, -2839, -2839, -2839, -1077, -1077, -1077, -1077,
+ 3114, 3114, 3114, 3114, -3218, -3218, -3218, -3218,
+ 1314, 1314, 1314, 1314, -14516, -14516, -14516, -14516,
+ -11564, -11564, -11564, -11564, 3674, 3674, 3674, 3674,
+ 5714, 5714, 5714, 5714, -12385, -12385, -12385, -12385,
+ 14070, 14070, 14070, 14070, 8660, 8660, 8660, 8660,
+ -10921, -10921, -10921, -10921, 6348, 6348, 6348, 6348,
+ -1159, -1159, -1159, -1159, 5098, 5098, 5098, 5098,
+ -10876, -2821, 2458, 1175, 5220, 11524, 2965, 5540,
+ 10788, 1890, -8399, 3517, 11305, -9519, -16172, -2226,
+ 6556, 1821, 9748, -170, -4175, 15417, -7860, -6674,
+ -8327, -4658, -8143, -14628, -11820, -15782, 10223, -14017,
+ 7660, -2109, -2986, -55, 2330, 453, 6151, -1727,
+ -2746, 10188, 12396, 7810, 4194, 10113, -1338, 6938,
+ 4930, -8713, -11468, 1218, -2343, 2618, 11247, -9209,
+ -12009, 1413, 9540, -9241, -5455, -7884, 8844, 370,
+ 7538, 6490, -6759, -15486, -6135, -15065, 11436, -399,
+ -1242, 4482, 15916, 16068, -10255, -16166, 13646, -5068,
+ 15401, 10119, -12060, -21, -6914, -6068, 11705, -10020,
+ 2074, 7002, 12524, 4535, 7839, -3949, -6751, 8884,
+ -4388, 15273, -12241, 8241, -215, -11519, -2666, -7578,
+ 1279, 2725, 23, -6580, 903, 15441, 1450, PADDING,
+ 5775, 5775, 5775, 5775, -301, -301, -301, -301,
+ -426, -426, -426, -426, -7, -7, -7, -7,
+ 9695, 9695, 9695, 9695, 14108, 14108, 14108, 14108,
+ -3802, -3802, -3802, -3802, 4434, 4434, 4434, 4434,
+ 10439, 10439, 10439, 10439, -13116, -13116, -13116, -13116,
+ 8396, 8396, 8396, 8396, 11831, 11831, 11831, 11831,
+ 13166, 13166, 13166, 13166, 7210, 7210, 7210, 7210,
+ -12409, -12409, -12409, -12409, -4738, -4738, -4738, -4738,
+ -13974, -7786, -11116, -8337, -14580, 2693, -9284, 2098,
+ 13484, 12526, 12745, -9185, -511, 3522, 13097, -5463,
+ 1805, -15662, -16145, -8895, 3527, -7375, -138, 8463,
+ 4210, -1149, -6679, 15748, 10551, -7570, 402, -5671,
+ 914, 12100, -13894, 3221, -4546, -1109, 1909, 5770,
+ 14430, -8740, 9372, 3247, -4095, 6476, 3661, 1911,
+ -8980, 11300, -11257, -1439, 8433, 16337, 3703, 11743,
+ 8956, 13889, 9807, -5140, 4868, 11969, 3029, 9887,
+ -9193, -5087, 4975, -12604, 6532, 2170, -5626, -14441,
+ -11223, -15980, -1167, 13494, -4588, -10940, -7951, -9569,
+ 11271, -1631, 4069, -333, 10751, -12905, -5514, -922,
+ -8545, 2911, 11372, 1237, 7802, -8455, 9196, -8633,
+ 3231, 13393, 15540, -7900, -13054, -8135, 6308, -4460,
+ 11407, -14214, -13302, -11137, -9556, 3682, -6732, PADDING,
+ 8969, 8969, 8969, 8969, 10817, 10817, 10817, 10817,
+ -7591, -7591, -7591, -7591, 5796, 5796, 5796, 5796,
+ -13444, -13444, -13444, -13444, 10807, 10807, 10807, 10807,
+ 13852, 13852, 13852, 13852, -13052, -13052, -13052, -13052,
+ 4503, 4503, 4503, 4503, -8721, -8721, -8721, -8721,
+ -11769, -11769, -11769, -11769, 11657, 11657, 11657, 11657,
+ -6146, -6146, -6146, -6146, 7052, 7052, 7052, 7052,
+ -7412, -7412, -7412, -7412, 3957, 3957, 3957, 3957,
+ -154, -642, 9417, -2674, 5215, -13356, -2359, -16020,
+ 9271, -15148, 378, -2946, -7583, 9167, -2599, 11231,
+ -8076, -14916, 12750, 15694, -11233, 1343, 6138, -1613,
+ -1122, -10879, 9604, 16180, -9599, 8700, 16204, -12329,
+ 5506, 9319, -9423, -4682, 6066, 11377, 13500, -4047,
+ -3399, -3135, 12958, -3853, -15964, 1543, -7383, -15908,
+ -10599, -15558, 3778, -12012, 9983, 6740, 13604, -14900,
+ 13129, 2703, 3538, 10511, 7244, 8532, 15561, 6026,
+ 15534, 12169, 10561, 11052, 6562, 15073, -11135, -10039,
+ 14686, -15289, 2999, -3015, -14921, 10369, 8303, 565,
+ 13020, 8231, 15134, 6018, 13233, -8148, -1098, -15585,
+ 12748, -1194, 9617, -13950, 159, -4093, -12175, 181,
+ 8519, -8873, -15404, -3138, -11345, -4359, -7380, -2541,
+ -9993, 2205, 15377, 6602, 314, 5858, -13510, PADDING,
+ -8657, -8657, -8657, -8657, -10607, -10607, -10607, -10607,
+ -6319, -6319, -6319, -6319, 14217, 14217, 14217, 14217,
+ 4346, 4346, 4346, 4346, 14417, 14417, 14417, 14417,
+ -3074, -3074, -3074, -3074, -5996, -5996, -5996, -5996,
+ -7162, -7162, -7162, -7162, -7916, -7916, -7916, -7916,
+ 7639, 7639, 7639, 7639, 9359, 9359, 9359, 9359,
+ -5668, -5668, -5668, -5668, 8495, 8495, 8495, 8495,
+ -7690, -7690, -7690, -7690, -10793, -10793, -10793, -10793,
+ -11911, -12769, -14764, 12662, 11863, -14548, -391, -10103,
+ 16313, -10180, 3173, -7154, 10292, 14518, -12095, -2799,
+ 13542, -8697, 5783, -1391, -11529, 13108, -10865, 14172,
+ -3407, -6250, 5260, -14862, -9369, -2775, 15798, -2714,
+ 479, -12279, -3757, 543, -3914, -9084, -3583, -6620,
+ 12636, -14713, -8074, -970, -3295, -13988, 8321, 2818,
+ -1082, 389, 3741, 16249, -637, 2650, 12452, 14569,
+ 8999, -9047, -13100, 10199, -7858, 1695, -1658, 15124,
+ 11033, 7170, -13916, 14689, -10615, 1234, -8111, -5476,
+ -11335, 3269, 13908, -6292, -8044, 11932, 12545, -6932,
+ -4498, 14046, 4810, -13836, -7732, 12143, -1365, 13081,
+ -6108, 11559, 9407, -11292, -10617, 14956, -4631, 11996,
+ 15625, -13273, 3602, -373, -4940, -12294, -1935, 10529,
+ -8788, 11871, 6388, -11609, 346, 7564, -14329, PADDING,
+ -6396, -6396, -6396, -6396, -13841, -13841, -13841, -13841,
+ -7999, -7999, -7999, -7999, 8042, 8042, 8042, 8042,
+ -14822, -14822, -14822, -14822, -5807, -5807, -5807, -5807,
+ 10625, 10625, 10625, 10625, 9415, 9415, 9415, 9415,
+ -9065, -9065, -9065, -9065, -8361, -8361, -8361, -8361,
+ 1789, 1789, 1789, 1789, 647, 647, 647, 647,
+ 1119, 1119, 1119, 1119, 4116, 4116, 4116, 4116,
+ 13078, 13078, 13078, 13078, 1269, 1269, 1269, 1269,
+ 1743, 9505, 4538, 5298, -14742, 13881, 8529, -149,
+ 16361, -15630, -14438, -9713, 13193, -16364, 1066, -4607,
+ 14236, 14521, 9892, 16246, 1247, -10633, 842, -1018,
+ -5420, -10660, 10343, 5124, -3623, -14492, 2594, -3343,
+ 14926, -5044, -15262, -2743, 15889, -10556, 6511, 13638,
+ -3503, 7716, -15172, -10524, 8913, 4058, -53, 12287,
+ 12207, -999, -4895, -5826, 16225, -2767, -5948, 7466,
+ 1349, 3711, 8735, 5898, -5178, 6866, 7402, -14606,
+ 8916, -9663, -3533, -5186, -5727, 15457, 3327, 13169,
+ 4650, -9743, -6546, 901, -10983, -5735, 13337, 13766,
+ 1005, 4319, -1133, -1045, -11111, -2461, -16244, 514,
+ 3346, 15422, -8900, -7130, -9087, 3106, -3141, 9361,
+}; // 1416
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_br[] = {
+ -3511, 1178, -1165, 3408, 2674, 3338, -5784, -1255,
+ -193, 6092, 923, 4167, 392, 425, -1620, -377,
+ -5163, -5002, 2151, 4119, -338, 2455, 3654, -1744,
+ -4939, -1248, -5797, 2148, 1945, 1325, 3624, -3344,
+ 5478, -2776, -2575, 1942, -2212, -3276, -1392, -506,
+ -2800, 2231, 1038, -6085, 2185, 1836, 375, -4578,
+ -4608, 20, -1522, -3343, 3947, 5690, -2894, 1314,
+ -5115, -2442, 3959, -5959, 1029, 5724, 1892, -5598,
+ 1254, -973, 5435, 1359, -1922, -3879, 3998, 2033,
+ 382, -316, 3988, -468, -6093, -3710, -5446, -5339,
+ 1728, -400, 6137, -4948, 3643, 5415, 5862, -6136,
+ 56, -3199, -5206, 5529, -1987, -1702, -3565, -654,
+ -476, -476, -476, -476, -4905, -4905, -4905, -4905,
+ -1544, -1544, -1544, -1544, -420, -420, -420, -420,
+ -243, -243, -243, -243, -671, -671, -671, -671,
+ 3136, 3136, 3136, 3136, 3400, 3400, 3400, 3400,
+ -3531, -3531, -3531, -3531, -3985, -3985, -3985, -3985,
+ 2178, 2178, 2178, 2178, 5559, 5559, 5559, 5559,
+ -3016, -3016, -3016, -3016, 3000, 3000, 3000, 3000,
+ 5191, 5191, 5191, 5191, 2399, 2399, 2399, 2399,
+ 5374, -2837, -130, 4354, -2396, -4452, 3296, -3949,
+ 726, 4611, 1853, 140, -1351, 4978, -5860, PADDING,
+ -4499, 1737, -5609, 3982, 4235, -3528, -4335, 2291,
+ -4906, 512, -4554, 2900, 5189, -1804, -5268, 1687,
+ 2600, -4705, -4475, 3017, 2360, -5216, -1226, 4251,
+ 2054, 3042, -463, 3981, -5509, 5219, -2689, -4138,
+ -5672, 622, -636, 2947, -3825, 4913, 3393, -3375,
+ -5464, -4670, -994, 239, -6094, -1403, -146, 406,
+ -1057, -3121, 5246, 1236, 364, 3028, 5518, -4739,
+ 2483, 1344, 3407, 1468, -204, 1409, 4605, -180,
+ 1018, -5925, 1041, 3514, 5574, -1973, 2344, 1278,
+ -5315, 4075, -4916, 4324, 522, -2169, 3262, -5079,
+ 1050, 4536, -5445, -3860, 2683, -1190, 3818, -6118,
+ 3789, 147, 5456, -4449, -4749, 5537, 4789, 4467,
+ 4048, 4048, 4048, 4048, 2884, 2884, 2884, 2884,
+ -3186, -3186, -3186, -3186, 2126, 2126, 2126, 2126,
+ -3510, -3510, -3510, -3510, -2865, -2865, -2865, -2865,
+ 2969, 2969, 2969, 2969, 2686, 2686, 2686, 2686,
+ 2249, 2249, 2249, 2249, 1153, 1153, 1153, 1153,
+ -5407, -5407, -5407, -5407, -1630, -1630, -1630, -1630,
+ -5332, -5332, -5332, -5332, 2370, 2370, 2370, 2370,
+ 3978, 3978, 3978, 3978, 3247, 3247, 3247, 3247,
+ 5067, -2197, -118, -2476, -5767, -827, 3748, 953,
+ 2768, 1635, 4255, 1177, 5777, 3328, -3195, PADDING,
+ -68, 4566, 1535, -60, 5232, -3607, 448, -4781,
+ 5845, 412, 3056, -4963, -2257, -5676, -3087, -4883,
+ -212, -3114, -3889, 5596, 1131, -1125, 5734, -5508,
+ 3765, 4176, -5653, -2461, -4145, -3961, -4564, -5826,
+ -2260, -5836, -3200, -2717, -3942, -1327, -1014, -4924,
+ 5588, -5102, -2528, -3744, 4505, -1417, 5835, 3975,
+ 5966, 2769, -579, 5987, 1445, -4860, 1176, 1275,
+ 1518, -5063, -4267, -2275, 1756, 3534, -3495, -2065,
+ 4624, -6077, -3263, 3600, -6068, -3602, 4080, 421,
+ 605, -2302, -504, 4213, -5886, -4782, 5594, 3029,
+ -4212, 975, -3438, 2844, 1105, -142, 5681, -3477,
+ 6008, 885, 5009, -1956, 1003, -3532, 241, 58,
+ -1484, -1484, -1484, -1484, 2780, 2780, 2780, 2780,
+ -2645, -2645, -2645, -2645, 2305, 2305, 2305, 2305,
+ -4372, -4372, -4372, -4372, 4414, 4414, 4414, 4414,
+ 3271, 3271, 3271, 3271, -1689, -1689, -1689, -1689,
+ 4895, 4895, 4895, 4895, -5195, -5195, -5195, -5195,
+ -4053, -4053, -4053, -4053, 5042, 5042, 5042, 5042,
+ -2174, -2174, -2174, -2174, 2847, 2847, 2847, 2847,
+ -4057, -4057, -4057, -4057, -3364, -3364, -3364, -3364,
+ 2525, -1381, 3584, 4177, 4989, 5331, -4278, 1673,
+ -2366, 3051, 4896, 2963, -5828, -5023, -1212, PADDING,
+ 3238, -3449, 3171, -2926, -464, -4265, -1092, 3205,
+ 346, 2068, 4840, -4032, 125, -1526, 612, -4227,
+ 3589, 2982, 4103, 1721, -5061, 438, 5993, 4209,
+ 5416, 2110, -814, -2450, 4727, -1866, 1908, 3448,
+ -3708, -1136, -4489, -1826, 1928, -3678, -5209, -3359,
+ -4404, -1389, -6127, -3163, 540, 4222, -4238, 3368,
+ -717, -1373, -2429, 1536, -1218, -3515, 3278, -5412,
+ -2164, -716, 416, 1705, -1208, 5211, -4538, -343,
+ 2127, -151, 2839, -3957, -5906, 2505, 431, -1579,
+ -3174, 52, 2766, -1323, 3336, 6055, 5874, -677,
+ 2049, -4912, -1321, 192, 3445, -4780, -4698, -5057,
+ -787, 3482, -1010, 5468, 3127, 4169, 2920, 5241,
+ 1777, 1777, 1777, 1777, 4654, 4654, 4654, 4654,
+ -2704, -2704, -2704, -2704, -4938, -4938, -4938, -4938,
+ -4437, -4437, -4437, -4437, -3149, -3149, -3149, -3149,
+ 4919, 4919, 4919, 4919, -3915, -3915, -3915, -3915,
+ -1663, -1663, -1663, -1663, 1426, 1426, 1426, 1426,
+ -5291, -5291, -5291, -5291, -3636, -3636, -3636, -3636,
+ 3, 3, 3, 3, 160, 160, 160, 160,
+ 113, 113, 113, 113, -2166, -2166, -2166, -2166,
+ -544, -5791, -339, 2468, -9, -1022, -480, 2842,
+ 1000, 4320, 81, -3091, 4591, -5728, 1646, PADDING,
+ -3332, 2532, 1481, -2940, -1701, -4697, -2626, -778,
+ 3758, -4390, 2276, 2593, 8, 4523, -3795, -5776,
+ 1901, -5118, 6063, 3846, -6026, -5969, -1681, 466,
+ 150, -4289, 5650, 2301, 5808, 2535, -2434, -2827,
+ -139, -3317, 2957, 2046, 3466, -3578, -530, 4504,
+ 3454, -4218, -982, 879, -457, 4301, 3268, -1849,
+ -2602, 502, -3793, -1573, -2929, -4649, -3821, 1030,
+ 648, -2307, -170, -874, 21, 1120, 791, -2873,
+ 5257, -3834, 5919, 4433, 5486, 3054, 1747, 3123,
+ 2503, 2948, -5782, 1566, 64, -3656, -683, -2459,
+ 835, 6065, 3570, -4240, -1319, 3150, -709, -4046,
+ -2078, -1112, -4322, -1958, -441, -922, 1058, 4079,
+ -1912, -1912, -1912, -1912, 435, 435, 435, 435,
+ -2381, -2381, -2381, -2381, 4096, 4096, 4096, 4096,
+ -3248, -3248, -3248, -3248, -5277, -5277, -5277, -5277,
+ 4645, 4645, 4645, 4645, -2143, -2143, -2143, -2143,
+ -1378, -1378, -1378, -1378, 4337, 4337, 4337, 4337,
+ 5444, 5444, 5444, 5444, -493, -493, -493, -493,
+ 1207, 1207, 1207, 1207, -1168, -1168, -1168, -1168,
+ 404, 404, 404, 404, 1065, 1065, 1065, 1065,
+ 5911, 4890, 3932, 2731, 5542, -145, -3459, -3637,
+ 2294, 1062, 3553, -4805, 2744, 3006, -3621, PADDING,
+ -1095, 3045, -4378, 4094, 1842, -72, -4352, -2712,
+ 4665, 3020, 3669, -944, 6125, -1040, 5410, 1790,
+ 3815, -1350, 4423, -1694, -2209, -3116, -1279, -2672,
+ -4974, 5078, -3019, 2840, -1868, -5411, -4820, -3094,
+ 2643, 5781, 1241, -3451, -3840, 4113, 2828, -4834,
+ 5406, 5673, -5287, 4770, 1882, -2035, 1251, 5275,
+ 1734, -5832, 3869, 1530, 1763, -189, 865, 5170,
+ 4565, 1783, -4194, -2478, 2253, -2730, -1160, -4518,
+ -5297, 6119, -3956, -1360, 1200, 5184, 2555, 6122,
+ -1594, 1962, 5106, -5961, -2692, 168, -4298, -3329,
+ 4049, 3728, -1159, -5990, 948, 1146, 1404, -325,
+ 2919, 3762, -4077, 4016, -652, -5766, -6099, -295,
+ 2422, 2422, 2422, 2422, 2187, 2187, 2187, 2187,
+ -2987, -2987, -2987, -2987, -3646, -3646, -3646, -3646,
+ 875, 875, 875, 875, 1607, 1607, 1607, 1607,
+ 4284, 4284, 4284, 4284, -5011, -5011, -5011, -5011,
+ 6039, 6039, 6039, 6039, 2566, 2566, 2566, 2566,
+ -6022, -6022, -6022, -6022, 2437, 2437, 2437, 2437,
+ 3780, 3780, 3780, 3780, 4976, 4976, 4976, 4976,
+ -5088, -5088, -5088, -5088, -1002, -1002, -1002, -1002,
+ 4231, 2548, 355, -3382, 3707, 1759, 5179, 3694,
+ -3712, 3135, 2747, -4846, 2975, 563, 2545, PADDING,
+ 751, -910, -4483, -1506, -1398, -826, -3502, 1658,
+ 5386, 510, -1944, -5368, -3808, -2373, -63, -3360,
+ 4510, 2946, 1927, 365, 5039, 2485, 1371, -614,
+ 2334, 1590, 1891, -1555, 417, -2338, 3418, -6138,
+ 4719, 5900, 5703, -3065, -3090, -5043, -5789, -5618,
+ 2622, 4661, 450, -578, -3670, 4987, 5135, -4684,
+ -2637, -5461, -1015, -881, 5547, 904, 24, 1280,
+ -1223, 4411, -5103, -1802, 2293, -4693, 4443, 3469,
+ -1293, 4737, 4774, -5429, 453, -5908, -418, -3772,
+ -5333, 2031, -5876, -2281, -156, 2767, 3969, -3991,
+ 1805, 2882, 2051, -1954, 2447, -6142, -576, -3963,
+ 3529, -3434, -218, -2908, 1843, -2361, -4115, -3030,
+ 545, 545, 545, 545, -3704, -3704, -3704, -3704,
+ 4143, 4143, 4143, 4143, -242, -242, -242, -242,
+ 1440, 1440, 1440, 1440, 3066, 3066, 3066, 3066,
+ 5084, 5084, 5084, 5084, 4885, 4885, 4885, 4885,
+ -5019, -5019, -5019, -5019, 2678, 2678, 2678, 2678,
+ -4714, -4714, -4714, -4714, -1537, -1537, -1537, -1537,
+ 3763, 3763, 3763, 3763, -27, -27, -27, -27,
+ -1632, -1632, -1632, -1632, -1017, -1017, -1017, -1017,
+ 2089, 5092, -3284, -2881, -3241, -729, 3289, -2013,
+ 1326, -5086, -3014, 3201, 949, 2625, 3504, PADDING,
+ 5082, 682, -5202, 5207, -4273, -2595, -5289, 567,
+ 3769, 293, -1406, -5349, -3007, 3480, 5530, -4099,
+ 2832, 3572, -3929, -4730, -5370, -3753, -5646, 6105,
+ -4153, 3805, -769, -50, 4360, -5054, -3723, -1936,
+ -4590, 980, -844, -4050, -3221, -3837, 5662, 2941,
+ -4855, 3232, -2633, 2945, 1265, -2171, -5604, -3944,
+ -2021, -1282, 1706, -3229, -3536, 3941, 6086, -3120,
+ 2213, -767, 5526, -216, -3285, -3154, -845, -7,
+ -4754, -1858, 426, 3315, -2925, -347, 3757, 1975,
+ -723, -174, -1693, 3009, -2655, 5735, 5868, 2738,
+ -4493, 3202, 2057, -5369, -5383, 1815, -350, -1512,
+ 5942, 1583, 1489, 2500, -1483, -5915, -1263, -49,
+ 1045, 1045, 1045, 1045, 2481, 2481, 2481, 2481,
+ -5698, -5698, -5698, -5698, -4861, -4861, -4861, -4861,
+ -3778, -3778, -3778, -3778, -773, -773, -773, -773,
+ 1067, 1067, 1067, 1067, -442, -442, -442, -442,
+ -2859, -2859, -2859, -2859, -5012, -5012, -5012, -5012,
+ 2912, 2912, 2912, 2912, -354, -354, -354, -354,
+ 3833, 3833, 3833, 3833, -390, -390, -390, -390,
+ 5101, 5101, 5101, 5101, -2401, -2401, -2401, -2401,
+ -1696, -1428, -334, -2426, 5755, -4632, -4388, -1260,
+ 790, 955, 1170, -2319, -2639, 4821, -3542, PADDING,
+ 4134, -5736, -722, 1305, -4043, 5146, 1479, PADDING, // dup
+ 4134, -5736, -722, 1305, -4043, 5146, 6830, 12277, // ninv=1
+}; // 1424
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_qinv_br[] = {
+ -9361, 3141, -3106, 9087, 7130, 8900, -15422, -3346,
+ -514, 16244, 2461, 11111, 1045, 1133, -4319, -1005,
+ -13766, -13337, 5735, 10983, -901, 6546, 9743, -4650,
+ -13169, -3327, -15457, 5727, 5186, 3533, 9663, -8916,
+ 14606, -7402, -6866, 5178, -5898, -8735, -3711, -1349,
+ -7466, 5948, 2767, -16225, 5826, 4895, 999, -12207,
+ -12287, 53, -4058, -8913, 10524, 15172, -7716, 3503,
+ -13638, -6511, 10556, -15889, 2743, 15262, 5044, -14926,
+ 3343, -2594, 14492, 3623, -5124, -10343, 10660, 5420,
+ 1018, -842, 10633, -1247, -16246, -9892, -14521, -14236,
+ 4607, -1066, 16364, -13193, 9713, 14438, 15630, -16361,
+ 149, -8529, -13881, 14742, -5298, -4538, -9505, -1743,
+ -1269, -1269, -1269, -1269, -13078, -13078, -13078, -13078,
+ -4116, -4116, -4116, -4116, -1119, -1119, -1119, -1119,
+ -647, -647, -647, -647, -1789, -1789, -1789, -1789,
+ 8361, 8361, 8361, 8361, 9065, 9065, 9065, 9065,
+ -9415, -9415, -9415, -9415, -10625, -10625, -10625, -10625,
+ 5807, 5807, 5807, 5807, 14822, 14822, 14822, 14822,
+ -8042, -8042, -8042, -8042, 7999, 7999, 7999, 7999,
+ 13841, 13841, 13841, 13841, 6396, 6396, 6396, 6396,
+ 14329, -7564, -346, 11609, -6388, -11871, 8788, -10529,
+ 1935, 12294, 4940, 373, -3602, 13273, -15625, PADDING,
+ -11996, 4631, -14956, 10617, 11292, -9407, -11559, 6108,
+ -13081, 1365, -12143, 7732, 13836, -4810, -14046, 4498,
+ 6932, -12545, -11932, 8044, 6292, -13908, -3269, 11335,
+ 5476, 8111, -1234, 10615, -14689, 13916, -7170, -11033,
+ -15124, 1658, -1695, 7858, -10199, 13100, 9047, -8999,
+ -14569, -12452, -2650, 637, -16249, -3741, -389, 1082,
+ -2818, -8321, 13988, 3295, 970, 8074, 14713, -12636,
+ 6620, 3583, 9084, 3914, -543, 3757, 12279, -479,
+ 2714, -15798, 2775, 9369, 14862, -5260, 6250, 3407,
+ -14172, 10865, -13108, 11529, 1391, -5783, 8697, -13542,
+ 2799, 12095, -14518, -10292, 7154, -3173, 10180, -16313,
+ 10103, 391, 14548, -11863, -12662, 14764, 12769, 11911,
+ 10793, 10793, 10793, 10793, 7690, 7690, 7690, 7690,
+ -8495, -8495, -8495, -8495, 5668, 5668, 5668, 5668,
+ -9359, -9359, -9359, -9359, -7639, -7639, -7639, -7639,
+ 7916, 7916, 7916, 7916, 7162, 7162, 7162, 7162,
+ 5996, 5996, 5996, 5996, 3074, 3074, 3074, 3074,
+ -14417, -14417, -14417, -14417, -4346, -4346, -4346, -4346,
+ -14217, -14217, -14217, -14217, 6319, 6319, 6319, 6319,
+ 10607, 10607, 10607, 10607, 8657, 8657, 8657, 8657,
+ 13510, -5858, -314, -6602, -15377, -2205, 9993, 2541,
+ 7380, 4359, 11345, 3138, 15404, 8873, -8519, PADDING,
+ -181, 12175, 4093, -159, 13950, -9617, 1194, -12748,
+ 15585, 1098, 8148, -13233, -6018, -15134, -8231, -13020,
+ -565, -8303, -10369, 14921, 3015, -2999, 15289, -14686,
+ 10039, 11135, -15073, -6562, -11052, -10561, -12169, -15534,
+ -6026, -15561, -8532, -7244, -10511, -3538, -2703, -13129,
+ 14900, -13604, -6740, -9983, 12012, -3778, 15558, 10599,
+ 15908, 7383, -1543, 15964, 3853, -12958, 3135, 3399,
+ 4047, -13500, -11377, -6066, 4682, 9423, -9319, -5506,
+ 12329, -16204, -8700, 9599, -16180, -9604, 10879, 1122,
+ 1613, -6138, -1343, 11233, -15694, -12750, 14916, 8076,
+ -11231, 2599, -9167, 7583, 2946, -378, 15148, -9271,
+ 16020, 2359, 13356, -5215, 2674, -9417, 642, 154,
+ -3957, -3957, -3957, -3957, 7412, 7412, 7412, 7412,
+ -7052, -7052, -7052, -7052, 6146, 6146, 6146, 6146,
+ -11657, -11657, -11657, -11657, 11769, 11769, 11769, 11769,
+ 8721, 8721, 8721, 8721, -4503, -4503, -4503, -4503,
+ 13052, 13052, 13052, 13052, -13852, -13852, -13852, -13852,
+ -10807, -10807, -10807, -10807, 13444, 13444, 13444, 13444,
+ -5796, -5796, -5796, -5796, 7591, 7591, 7591, 7591,
+ -10817, -10817, -10817, -10817, -8969, -8969, -8969, -8969,
+ 6732, -3682, 9556, 11137, 13302, 14214, -11407, 4460,
+ -6308, 8135, 13054, 7900, -15540, -13393, -3231, PADDING,
+ 8633, -9196, 8455, -7802, -1237, -11372, -2911, 8545,
+ 922, 5514, 12905, -10751, 333, -4069, 1631, -11271,
+ 9569, 7951, 10940, 4588, -13494, 1167, 15980, 11223,
+ 14441, 5626, -2170, -6532, 12604, -4975, 5087, 9193,
+ -9887, -3029, -11969, -4868, 5140, -9807, -13889, -8956,
+ -11743, -3703, -16337, -8433, 1439, 11257, -11300, 8980,
+ -1911, -3661, -6476, 4095, -3247, -9372, 8740, -14430,
+ -5770, -1909, 1109, 4546, -3221, 13894, -12100, -914,
+ 5671, -402, 7570, -10551, -15748, 6679, 1149, -4210,
+ -8463, 138, 7375, -3527, 8895, 16145, 15662, -1805,
+ 5463, -13097, -3522, 511, 9185, -12745, -12526, -13484,
+ -2098, 9284, -2693, 14580, 8337, 11116, 7786, 13974,
+ 4738, 4738, 4738, 4738, 12409, 12409, 12409, 12409,
+ -7210, -7210, -7210, -7210, -13166, -13166, -13166, -13166,
+ -11831, -11831, -11831, -11831, -8396, -8396, -8396, -8396,
+ 13116, 13116, 13116, 13116, -10439, -10439, -10439, -10439,
+ -4434, -4434, -4434, -4434, 3802, 3802, 3802, 3802,
+ -14108, -14108, -14108, -14108, -9695, -9695, -9695, -9695,
+ 7, 7, 7, 7, 426, 426, 426, 426,
+ 301, 301, 301, 301, -5775, -5775, -5775, -5775,
+ -1450, -15441, -903, 6580, -23, -2725, -1279, 7578,
+ 2666, 11519, 215, -8241, 12241, -15273, 4388, PADDING,
+ -8884, 6751, 3949, -7839, -4535, -12524, -7002, -2074,
+ 10020, -11705, 6068, 6914, 21, 12060, -10119, -15401,
+ 5068, -13646, 16166, 10255, -16068, -15916, -4482, 1242,
+ 399, -11436, 15065, 6135, 15486, 6759, -6490, -7538,
+ -370, -8844, 7884, 5455, 9241, -9540, -1413, 12009,
+ 9209, -11247, -2618, 2343, -1218, 11468, 8713, -4930,
+ -6938, 1338, -10113, -4194, -7810, -12396, -10188, 2746,
+ 1727, -6151, -453, -2330, 55, 2986, 2109, -7660,
+ 14017, -10223, 15782, 11820, 14628, 8143, 4658, 8327,
+ 6674, 7860, -15417, 4175, 170, -9748, -1821, -6556,
+ 2226, 16172, 9519, -11305, -3517, 8399, -1890, -10788,
+ -5540, -2965, -11524, -5220, -1175, -2458, 2821, 10876,
+ -5098, -5098, -5098, -5098, 1159, 1159, 1159, 1159,
+ -6348, -6348, -6348, -6348, 10921, 10921, 10921, 10921,
+ -8660, -8660, -8660, -8660, -14070, -14070, -14070, -14070,
+ 12385, 12385, 12385, 12385, -5714, -5714, -5714, -5714,
+ -3674, -3674, -3674, -3674, 11564, 11564, 11564, 11564,
+ 14516, 14516, 14516, 14516, -1314, -1314, -1314, -1314,
+ 3218, 3218, 3218, 3218, -3114, -3114, -3114, -3114,
+ 1077, 1077, 1077, 1077, 2839, 2839, 2839, 2839,
+ 15761, 13038, 10484, 7282, 14777, -386, -9223, -9697,
+ 6116, 2831, 9473, -12812, 7316, 8015, -9655, PADDING,
+ -2919, 8119, -11673, 10916, 4911, -191, -11604, -7231,
+ 12438, 8052, 9783, -2517, 16332, -2773, 14425, 4772,
+ 10172, -3599, 11793, -4516, -5890, -8308, -3410, -7124,
+ -13262, 13540, -8050, 7572, -4980, -14428, -12852, -8249,
+ 7047, 15414, 3309, -9201, -10239, 10967, 7540, -12889,
+ 14414, 15126, -14097, 12718, 5018, -5426, 3335, 14065,
+ 4623, -15550, 10316, 4079, 4700, -503, 2306, 13785,
+ 12172, 4754, -11183, -6607, 6007, -7279, -3093, -12047,
+ -14124, 16316, -10548, -3626, 3199, 13822, 6812, 16324,
+ -4250, 5231, 13614, -15894, -7178, 447, -11460, -8876,
+ 10796, 9940, -3090, -15972, 2527, 3055, 3743, -866,
+ 7783, 10031, -10871, 10708, -1738, -15374, -16262, -786,
+ 6458, 6458, 6458, 6458, 5831, 5831, 5831, 5831,
+ -7964, -7964, -7964, -7964, -9721, -9721, -9721, -9721,
+ 2333, 2333, 2333, 2333, 4284, 4284, 4284, 4284,
+ 11423, 11423, 11423, 11423, -13361, -13361, -13361, -13361,
+ 16102, 16102, 16102, 16102, 6842, 6842, 6842, 6842,
+ -16057, -16057, -16057, -16057, 6498, 6498, 6498, 6498,
+ 10079, 10079, 10079, 10079, 13268, 13268, 13268, 13268,
+ -13566, -13566, -13566, -13566, -2671, -2671, -2671, -2671,
+ 11281, 6794, 946, -9017, 9884, 4690, 13809, 9849,
+ -9897, 8359, 7324, -12921, 7932, 1501, 6786, PADDING,
+ 2002, -2426, -11953, -4015, -3727, -2202, -9337, 4420,
+ 14361, 1359, -5183, -14313, -10153, -6327, -167, -8959,
+ 12025, 7855, 5138, 973, 13436, 6626, 3655, -1637,
+ 6223, 4239, 5042, -4146, 1111, -6234, 9113, -16366,
+ 12582, 15732, 15206, -8172, -8239, -13446, -15436, -14980,
+ 6991, 12428, 1199, -1541, -9785, 13297, 13692, -12489,
+ -7031, -14561, -2706, -2349, 14790, 2410, 63, 3413,
+ -3261, 11761, -13606, -4804, 6114, -12513, 11847, 9249,
+ -3447, 12630, 12729, -14476, 1207, -15753, -1114, -10057,
+ -14220, 5415, -15668, -6082, -415, 7378, 10583, -10641,
+ 4812, 7684, 5468, -5210, 6524, -16377, -1535, -10567,
+ 9409, -9156, -581, -7754, 4914, -6295, -10972, -8079,
+ 1453, 1453, 1453, 1453, -9876, -9876, -9876, -9876,
+ 11047, 11047, 11047, 11047, -645, -645, -645, -645,
+ 3839, 3839, 3839, 3839, 8175, 8175, 8175, 8175,
+ 13556, 13556, 13556, 13556, 13025, 13025, 13025, 13025,
+ -13382, -13382, -13382, -13382, 7140, 7140, 7140, 7140,
+ -12569, -12569, -12569, -12569, -4098, -4098, -4098, -4098,
+ 10033, 10033, 10033, 10033, -71, -71, -71, -71,
+ -4351, -4351, -4351, -4351, -2711, -2711, -2711, -2711,
+ 5570, 13577, -8756, -7682, -8641, -1943, 8769, -5367,
+ 3535, -13561, -8036, 8535, 2530, 6999, 9343, PADDING,
+ 13550, 1818, -13870, 13884, -11393, -6919, -14102, 1511,
+ 10049, 781, -3749, -14262, -8018, 9279, 14745, -10929,
+ 7551, 9524, -10476, -12612, -14318, -10007, -15054, 16278,
+ -11073, 10145, -2050, -133, 11625, -13476, -9927, -5162,
+ -12239, 2613, -2250, -10799, -8588, -10231, 15097, 7842,
+ -12945, 8617, -7020, 7852, 3373, -5788, -14942, -10516,
+ -5388, -3418, 4548, -8609, -9428, 10508, 16228, -8319,
+ 5900, -2045, 14734, -575, -8759, -8409, -2253, -18,
+ -12676, -4954, 1135, 8839, -7799, -925, 10017, 5266,
+ -1927, -463, -4514, 8023, -7079, 15292, 15646, 7300,
+ -11980, 8537, 5484, -14316, -14353, 4839, -933, -4031,
+ 15844, 4220, 3970, 6666, -3954, -15772, -3367, -130,
+ 2786, 2786, 2786, 2786, 6615, 6615, 6615, 6615,
+ -15193, -15193, -15193, -15193, -12961, -12961, -12961, -12961,
+ -10073, -10073, -10073, -10073, -2061, -2061, -2061, -2061,
+ 2845, 2845, 2845, 2845, -1178, -1178, -1178, -1178,
+ -7623, -7623, -7623, -7623, -13364, -13364, -13364, -13364,
+ 7764, 7764, 7764, 7764, -943, -943, -943, -943,
+ 10220, 10220, 10220, 10220, -1039, -1039, -1039, -1039,
+ 13601, 13601, 13601, 13601, -6402, -6402, -6402, -6402,
+ -4522, -3807, -890, -6468, 15345, -12350, -11700, -3359,
+ 2106, 2546, 3119, -6183, -7036, 12854, -9444, PADDING,
+ 11023, -15294, -1925, 3479, -10780, 13721, 3943, PADDING, // dup
+ 11023, -15294, -1925, 3479, -10780, 13721, 18211, 32736, // ninv=1
+}; // 1424
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.h
new file mode 100644
index 000000000..f04568d7c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.h
@@ -0,0 +1,23 @@
+#ifndef NTT_CONSTS
+#define NTT_CONSTS
+
+#include <stdint.h>
+
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_qmvq[8];
+
+/*
+ * Table for NTT, binary case:
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ */
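+/*
+ * Note (illustrative): q - 1 = 12288 = 2^12 * 3, so Z_q contains elements of
+ * order 2048 = 2 * FALCON_N; a primitive 2048-th root of unity such as g = 7 is
+ * exactly what the negacyclic NTT over Z_q[x]/(x^1024 + 1) requires.
+ */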
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_qinv_br[];
+
+/*
+ * Table for inverse NTT
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
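+/*
+ * Quick check (illustrative): 7 * 8778 = 61446 = 5 * 12289 + 1, so 8778 is
+ * indeed the inverse of g = 7 modulo q = 12289.
+ */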
+
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_qinv_br[];
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/params.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/params.h
new file mode 100644
index 000000000..d494a4806
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/params.h
@@ -0,0 +1,17 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define FALCON_LOGN 10
+
+#define FALCON_N (1 << FALCON_LOGN)
+#define FALCON_Q 12289
+#define FALCON_QINV (-12287) // pow(12289, -1, pow(2, 16)) - pow(2, 16)
+#define FALCON_V 5461 // Barrett reduction
+#define FALCON_MONT 4091 // pow(2, 16, 12289)
+#define FALCON_MONT_BR 10908 // (4091 << 16)//q//2
+
+#define FALCON_NINV_MONT 64 // pow(1024, -1, 12289) * pow(2, 16, 12289)
+#define FALCON_NINV_MONT_BR 170 // (64 << 16) // q //2
+#define FALCON_LOG2_NINV_MONT 6
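+// Worked check (illustrative): 2^16 = 65536 = 5 * 12289 + 4091, hence FALCON_MONT = 4091,
+// and 2^16 / 1024 = 2^6 = 64 (mod q) gives FALCON_NINV_MONT, matching FALCON_LOG2_NINV_MONT = 6.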
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly.h
new file mode 100644
index 000000000..2d7509746
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly.h
@@ -0,0 +1,42 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "inner.h"
+#include "params.h"
+
+typedef enum ntt_domain {
+ NTT_NONE = 0,
+ NTT_MONT = 1,
+ NTT_MONT_INV = 2,
+} ntt_domain_t;
+
+typedef enum invntt_domain {
+ INVNTT_NONE = 0,
+ INVNTT_NINV = 1,
+} invntt_domain_t;
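+/*
+ * These flags select whether extra scaling is folded into the last transform
+ * layer: INVNTT_NINV makes the inverse NTT also multiply by n^-1, and
+ * NTT_MONT / NTT_MONT_INV presumably fold the Montgomery factor (or its
+ * inverse) into the forward NTT in the same fashion, avoiding a separate pass.
+ */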
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]);
+
+uint16_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_float.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_float.c
new file mode 100644
index 000000000..10a302cf1
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_float.c
@@ -0,0 +1,1459 @@
+/*
+ * Poly FFT
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ // n = 2;
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfadd(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ // n = 4
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfadd(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfadd(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfaddx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a - b
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfsub(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfsub(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfsub(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfsubx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = -a
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+
+ vfneg(neon_c.val[0], neon_a.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ c[1] = fpr_neg(a[1]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2
+ vload(neon_a.val[0], &a[2]);
+ vfneg(neon_c.val[0], neon_a.val[0]);
+ vstore(&c[2], neon_c.val[0]);
+ break;
+
+ case 3:
+ // n = 8; hn = 4
+ vloadx2(neon_a2, &a[4]);
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+ vstorex2(&c[4], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = hn; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log1(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re;
+ c[1] = c_im;
+}
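+/*
+ * The vectorized paths below compute the same complex product; Falcon's FFT
+ * layout is split-complex (real parts in the first half of the array,
+ * imaginary parts in the second), which is why the FPC_MUL* macros take
+ * separate re/im inputs.
+ */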
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log2(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_c;
+ float64x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+
+ FPC_MUL(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re;
+ neon_c.val[1] = c_im;
+
+ vstorex2(&c[0], neon_c);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log3(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ FPC_MULx2(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re.val[0];
+ neon_c.val[1] = c_re.val[1];
+ neon_c.val[2] = c_im.val[0];
+ neon_c.val[3] = c_im.val[1];
+
+ vstorex4(&c[0], neon_c);
+}
+
+/* see inner.h */
+/*
+ * c = a * b
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im; // 24
+ float64x2x4_t c_re, c_im; // 8
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log1(c, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log2(c, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log3(c, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MULx4(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log1(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+ d_re = d[0];
+ d_im = d[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re + d_re;
+ c[1] = c_im + d_im;
+
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log2(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_d;
+ float64x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+ vloadx2(neon_d, &d[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+ d_re = neon_d.val[0];
+ d_im = neon_d.val[1];
+
+ FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re;
+ neon_d.val[1] = d_im;
+
+ vstorex2(&c[0], neon_d);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log3(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_d;
+ float64x2x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+ vloadx4(neon_d, &d[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ d_re.val[0] = neon_d.val[0];
+ d_re.val[1] = neon_d.val[1];
+ d_im.val[0] = neon_d.val[2];
+ d_im.val[1] = neon_d.val[3];
+
+ FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re.val[0];
+ neon_d.val[1] = d_re.val[1];
+ neon_d.val[2] = d_im.val[0];
+ neon_d.val[3] = d_im.val[1];
+
+ vstorex4(&c[0], neon_d);
+}
+
+/* see inner.h */
+/*
+ * c = d + a * b
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(fpr *c, const fpr *restrict d,
+ const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im, d_re, d_im; // 32
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log1(c, d, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log2(c, d, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log3(c, d, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
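+/*
+ * d = a * adj(b)
+ */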
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(fpr *d, fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&d[i], d_re);
+ vstorex4(&d[i + hn], d_im);
+ }
+}
+
+// c = d + a * adj(b)
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(fpr *c,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, c_re, c_im; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ vfdupx4(c_im, 0);
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+/*
+ * c = d + a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(fpr *c,
+ const fpr *restrict d,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, d_re; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(d_re, &d[i]);
+
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(d_re.val[0], d_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(d_re.val[1], d_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(d_re.val[2], d_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(d_re.val[3], d_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], d_re);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * scalar_x
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x,
+ unsigned logn) {
+ // assert(logn >= 3);
+ // Total SIMD registers: 9
+ const unsigned falcon_n = 1 << logn;
+ float64x2x4_t neon_a, neon_c; // 8
+ float64x2_t neon_x; // 1
+ neon_x = vdupq_n_f64(x);
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfmulx4_i(neon_c, neon_a, neon_x);
+
+ vstorex4(&c[i], neon_c);
+ }
+}
+
+/* see inner.h
+ * Unused in the implementation
+ */
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_fft(fpr *restrict c,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re, c_im, m;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmulx4(m, b_re, b_re);
+ vfmlax4(m, m, b_im, b_im);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmlax4(c_re, c_re, a_im, b_im);
+
+ vfinvx4(m, m);
+
+ vfmulx4(c_im, a_im, b_re);
+ vfmlsx4(c_im, c_im, a_re, b_im);
+
+ vfmulx4(c_re, c_re, m);
+ vfmulx4(c_im, c_im, m);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+/* see inner.h */
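+/*
+ * d = 1 / ( a*adj(a) + b*adj(b) ), computed coefficient-wise.
+ * The result is autoadjoint (purely real), so only the first hn real
+ * coefficients of d are written.
+ */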
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re;
+ float64x2x2_t x, y;
+ float64x2_t z;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ /*
+ * x_re = a[0];
+ * x_im = a[1];
+ * y_re = b[0];
+ * y_im = b[1];
+ * d[0] = 1.0/( (x_re*x_re) + (x_im*x_im) + (y_re*y_re) + (y_im*y_im) );
+ */
+ vload(a_re.val[0], &a[0]);
+ vload(b_re.val[0], &b[0]);
+ vfmul(a_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], a_re.val[0], b_re.val[0], b_re.val[0]);
+ d[0] = 1.0 / vaddvq_f64(c_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vloadx2(x, &a[0]);
+ vloadx2(y, &b[0]);
+
+ vfmul(z, x.val[0], x.val[0]);
+ vfmla(z, z, x.val[1], x.val[1]);
+ vfmla(z, z, y.val[0], y.val[0]);
+ vfmla(z, z, y.val[1], y.val[1]);
+ vfinv(z, z);
+
+ vstore(&d[0], z);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vloadx4(a_re, &a[0]);
+ vloadx4(b_re, &b[0]);
+
+ vfmul(x.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(x.val[0], x.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(x.val[0], x.val[0], a_re.val[2], a_re.val[2]);
+ vfmla(x.val[0], x.val[0], b_re.val[2], b_re.val[2]);
+ vfinv(x.val[0], x.val[0]);
+
+ vfmul(x.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(x.val[1], x.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(x.val[1], x.val[1], a_re.val[3], a_re.val[3]);
+ vfmla(x.val[1], x.val[1], b_re.val[3], b_re.val[3]);
+ vfinv(x.val[1], x.val[1]);
+
+ vstorex2(&d[0], x);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_im.val[0], b_im.val[0]);
+ vfinv(c_re.val[0], c_re.val[0]);
+
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_im.val[1], b_im.val[1]);
+ vfinv(c_re.val[1], c_re.val[1]);
+
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_re.val[2], b_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_im.val[2], b_im.val[2]);
+ vfinv(c_re.val[2], c_re.val[2]);
+
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_re.val[3], b_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_im.val[3], b_im.val[3]);
+ vfinv(c_re.val[3], c_re.val[3]);
+
+ vstorex4(&d[i], c_re);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
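+/*
+ * d = F*adj(f) + G*adj(g)
+ */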
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add_muladj_fft(
+ fpr *restrict d, const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t F_re, F_im, G_re, G_im;
+ float64x2x4_t f_re, f_im, g_re, g_im;
+ float64x2x4_t a_re, a_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(F_re, &F[i]);
+ vloadx4(F_im, &F[i + hn]);
+ vloadx4(f_re, &f[i]);
+ vloadx4(f_im, &f[i + hn]);
+
+ FPC_MUL_CONJx4(a_re, a_im, F_re, F_im, f_re, f_im);
+
+ vloadx4(G_re, &G[i]);
+ vloadx4(g_re, &g[i]);
+
+ vloadx4(G_im, &G[i + hn]);
+ vloadx4(g_im, &g[i + hn]);
+
+ FPC_MLA_CONJx4(a_re, a_im, G_re, G_im, g_re, g_im);
+
+ vstorex4(&d[i], a_re);
+ vstorex4(&d[i + hn], a_im);
+ }
+}
+
+/* see inner.h */
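+/*
+ * c = a * b, where b is autoadjoint (purely real in FFT representation);
+ * only the first hn real coefficients of b are read.
+ */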
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, c_re, c_im;
+ float64x2x2_t a_re_im, b_re_im, c_re_im;
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ vload(a_re.val[0], &a[0]);
+ vfmuln(a_re.val[0], a_re.val[0], b[0]);
+ vstore(&c[0], a_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vload2(a_re_im, &a[0]);
+ vload(b_re_im.val[0], &b[0]);
+ vfmul_lane(c_re_im.val[0], a_re_im.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re_im.val[1], a_re_im.val[1], b_re_im.val[0], 1);
+ vstore2(&c[0], c_re_im);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vload4(a_re, &a[0]);
+ vloadx2(b_re_im, &b[0]);
+ vfmul_lane(c_re.val[0], a_re.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re.val[1], a_re.val[1], b_re_im.val[0], 1);
+ vfmul_lane(c_re.val[2], a_re.val[2], b_re_im.val[1], 0);
+ vfmul_lane(c_re.val[3], a_re.val[3], b_re_im.val[1], 1);
+ vstore4(&c[0], c_re);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmulx4(c_im, a_im, b_re);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
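+/*
+ * c = a / b, where b is autoadjoint (purely real in FFT representation);
+ * only the first hn real coefficients of b are read.
+ */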
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, binv, c_re, c_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(b_re, &b[i]);
+ vfinvx4(binv, b_re);
+
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmulx4(c_re, a_re, binv);
+ vfmulx4(c_im, a_im, binv);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log1(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&g11[0], g11_re.val[0]);
+ vstore(&g01[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log2(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&g11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&g01[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log3(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&g11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&g01[0], mu_re);
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01,
+ fpr *restrict g11, unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
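+ // LDL decomposition of the self-adjoint matrix [[g00, g01], [adj(g01), g11]]:
+ // mu = g01/g00 is written (conjugated) to g01, and the Schur complement
+ // d11 = g11 - mu*adj(g01) overwrites g11.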
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log1(g00, g01, g11);
+
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log2(g00, g01, g11);
+
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log3(g00, g01, g11);
+
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&g01[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&g11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&g11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&g01[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log1(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&d11[0], g11_re.val[0]);
+ vstore(&l10[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log2(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&d11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&l10[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log3(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&d11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&l10[0], mu_re);
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
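+ // Same LDL decomposition as poly_LDL_fft, but the results are written to
+ // the separate outputs d11 and l10 instead of overwriting g11 and g01.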
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log1(d11, l10, g00, g01, g11);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log2(d11, l10, g00, g01, g11);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log3(d11, l10, g00, g01, g11);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&l10[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&d11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&d11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&l10[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm,
+ const unsigned falcon_n) {
+ float64x2x4_t neon_t0;
+ uint16x8x4_t neon_hm;
+ uint16x8_t neon_zero;
+ uint32x4x4_t neon_hmu32[2];
+ int64x2x4_t neon_hms64[4];
+ neon_zero = vdupq_n_u16(0);
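+ // Zero-extend each unsigned 16-bit coefficient of hm to 64 bits by
+ // interleaving with zero (u16 -> u32 -> s64), then convert to double.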
+ for (unsigned u = 0; u < falcon_n; u += 32) {
+ neon_hm = vld1q_u16_x4(&hm[u]);
+ neon_hmu32[0].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[1], neon_zero);
+ neon_hmu32[0].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[1], neon_zero);
+
+ neon_hmu32[1].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[3], neon_zero);
+ neon_hmu32[1].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[3], neon_zero);
+
+ neon_hms64[0].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[1].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+
+ neon_hms64[2].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[3].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+
+ vfcvtx4(neon_t0, neon_hms64[0]);
+ vstorex4(&t0[u], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[1]);
+ vstorex4(&t0[u + 8], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[2]);
+ vstorex4(&t0[u + 16], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[3]);
+ vstorex4(&t0[u + 24], neon_t0);
+ }
+}
+
+fpr PQCLEAN_FALCONPADDED1024_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2) {
+ float64x2x4_t r1, r11, r2, r22;
+ float64x2x4_t bnorm, bnorm2;
+
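+ // Accumulate the squared norm ||rt1||^2 + ||rt2||^2 using two independent
+ // accumulator sets (bnorm, bnorm2), which are summed at the end.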
+ vfdupx4(bnorm, 0);
+ vfdupx4(bnorm2, 0);
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r1, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r1.val[0], r1.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r1.val[1], r1.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r1.val[2], r1.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r1.val[3], r1.val[3]);
+
+ vloadx4(r11, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r11.val[0], r11.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r11.val[1], r11.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r11.val[2], r11.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r11.val[3], r11.val[3]);
+ }
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r2, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r2.val[0], r2.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r2.val[1], r2.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r2.val[2], r2.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r2.val[3], r2.val[3]);
+
+ vloadx4(r22, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r22.val[0], r22.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r22.val[1], r22.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r22.val[2], r22.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r22.val[3], r22.val[3]);
+ }
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[1]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[1]);
+ vfadd(bnorm.val[2], bnorm.val[2], bnorm.val[3]);
+ vfadd(bnorm2.val[2], bnorm2.val[2], bnorm2.val[3]);
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[2]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[2]);
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm2.val[0]);
+
+ return vaddvq_f64(bnorm.val[0]);
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_int.c
new file mode 100644
index 000000000..d9a353970
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_int.c
@@ -0,0 +1,501 @@
+/*
+ * poly_int.c
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+#include "macrous.h"
+#include "params.h"
+#include "poly.h"
+#include "ntt_consts.h"
+
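+/*
+ * Widen a polynomial with int8_t coefficients to int16_t coefficients,
+ * processing 128 coefficients per iteration.
+ */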
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]) {
+ // Total SIMD registers: 24 = 16 + 8
+ int16x8x4_t a, b, e, f; // 16
+ int8x16x4_t c, d; // 8
+
+ for (int i = 0; i < FALCON_N; i += 128) {
+ c = vld1q_s8_x4(&in[i]);
+
+ a.val[0] = vmovl_s8(vget_low_s8(c.val[0]));
+ a.val[2] = vmovl_s8(vget_low_s8(c.val[1]));
+ b.val[0] = vmovl_s8(vget_low_s8(c.val[2]));
+ b.val[2] = vmovl_s8(vget_low_s8(c.val[3]));
+
+ a.val[1] = vmovl_high_s8(c.val[0]);
+ a.val[3] = vmovl_high_s8(c.val[1]);
+ b.val[1] = vmovl_high_s8(c.val[2]);
+ b.val[3] = vmovl_high_s8(c.val[3]);
+
+ d = vld1q_s8_x4(&in[i + 64]);
+
+ e.val[0] = vmovl_s8(vget_low_s8(d.val[0]));
+ e.val[2] = vmovl_s8(vget_low_s8(d.val[1]));
+ f.val[0] = vmovl_s8(vget_low_s8(d.val[2]));
+ f.val[2] = vmovl_s8(vget_low_s8(d.val[3]));
+
+ e.val[1] = vmovl_high_s8(d.val[0]);
+ e.val[3] = vmovl_high_s8(d.val[1]);
+ f.val[1] = vmovl_high_s8(d.val[2]);
+ f.val[3] = vmovl_high_s8(d.val[3]);
+
+ vst1q_s16_x4(&out[i], a);
+ vst1q_s16_x4(&out[i + 32], b);
+ vst1q_s16_x4(&out[i + 64], e);
+ vst1q_s16_x4(&out[i + 96], f);
+ }
+}
+
+/*
+ * Return f[] = f[]/g[] % 12289
+ * See assembly https://godbolt.org/z/od3Ex7Mbx
+ */
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
+ // Total SIMD registers: 24 = 4 + 19 + 1
+ int16x8x4_t src, dst, t, k; // 4
+ int16x8x4_t y0, y1, y2, y3, y4, y5,
+ y6, y7, y8, y9, y10, y11, y12,
+ y13, y14, y15, y16, y17, y18; // 19
+ int16x8_t neon_qmvm; // 1
+
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ // Compute y18 = g^12287, starting from y0 = g
+ vload_s16_x4(y0, &g[i]);
+
+ // y0 is already in Montgomery domain
+
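+ // 12287 = FALCON_Q - 2, so y18 = g^(q-2) = 1/g mod q by Fermat's little
+ // theorem; the fixed chain of Montgomery multiplications below computes it.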
+ montmul_x4(y1, y0, y0, neon_qmvm, t);
+ montmul_x4(y2, y1, y0, neon_qmvm, k);
+ montmul_x4(y3, y2, y1, neon_qmvm, t);
+ montmul_x4(y4, y3, y3, neon_qmvm, k);
+ montmul_x4(y5, y4, y4, neon_qmvm, t);
+ montmul_x4(y6, y5, y5, neon_qmvm, k);
+ montmul_x4(y7, y6, y6, neon_qmvm, t);
+ montmul_x4(y8, y7, y7, neon_qmvm, k);
+ montmul_x4(y9, y8, y2, neon_qmvm, t);
+ montmul_x4(y10, y9, y8, neon_qmvm, k);
+ montmul_x4(y11, y10, y10, neon_qmvm, t);
+ montmul_x4(y12, y11, y11, neon_qmvm, k);
+ montmul_x4(y13, y12, y9, neon_qmvm, t);
+ montmul_x4(y14, y13, y13, neon_qmvm, k);
+ montmul_x4(y15, y14, y14, neon_qmvm, t);
+ montmul_x4(y16, y15, y10, neon_qmvm, k);
+ montmul_x4(y17, y16, y16, neon_qmvm, t);
+ montmul_x4(y18, y17, y0, neon_qmvm, k);
+
+ vload_s16_x4(src, &f[i]);
+
+ montmul_x4(dst, y18, src, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], dst);
+ }
+}
+
+/*
+ * f = g - s, with the result reduced via Barrett reduction modulo q
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]) {
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e, h, t; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &g[i]);
+ vload_s16_x4(b, &s[i]);
+
+ e.val[0] = vsubq_s16(a.val[0], b.val[0]);
+ e.val[1] = vsubq_s16(a.val[1], b.val[1]);
+ e.val[2] = vsubq_s16(a.val[2], b.val[2]);
+ e.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+ vload_s16_x4(c, &g[i + 32]);
+ vload_s16_x4(d, &s[i + 32]);
+
+ h.val[0] = vsubq_s16(c.val[0], d.val[0]);
+ h.val[1] = vsubq_s16(c.val[1], d.val[1]);
+ h.val[2] = vsubq_s16(c.val[2], d.val[2]);
+ h.val[3] = vsubq_s16(c.val[3], d.val[3]);
+
+ barrett_x4(e, neon_qmvm, t);
+ barrett_x4(h, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], e);
+ vstore_s16_x4(&f[i + 32], h);
+ }
+}
+
+/*
+ * Check whether f[] contains a zero coefficient.
+ * Return:
+ * non-zero if some coefficient of f[] is zero
+ * 0 otherwise
+ */
+uint16_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 22 = 12 + 8 + 2
+ int16x8x4_t a, b; // 8
+ uint16x8x4_t c, d, e1; // 12
+ uint16x8x2_t e2; // 2
+
+ e2.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+
+ // Compare bitwise Equal to zero (vector)
+ // a == 0 ? 1 : 0;
+ c.val[0] = vceqzq_s16(a.val[0]);
+ c.val[1] = vceqzq_s16(a.val[1]);
+ c.val[2] = vceqzq_s16(a.val[2]);
+ c.val[3] = vceqzq_s16(a.val[3]);
+
+ vload_s16_x4(b, &f[i + 32]);
+
+ d.val[0] = vceqzq_s16(b.val[0]);
+ d.val[1] = vceqzq_s16(b.val[1]);
+ d.val[2] = vceqzq_s16(b.val[2]);
+ d.val[3] = vceqzq_s16(b.val[3]);
+
+ e1.val[0] = vorrq_u16(d.val[0], c.val[0]);
+ e1.val[1] = vorrq_u16(d.val[1], c.val[1]);
+ e1.val[2] = vorrq_u16(d.val[2], c.val[2]);
+ e1.val[3] = vorrq_u16(d.val[3], c.val[3]);
+
+ e1.val[0] = vorrq_u16(e1.val[0], e1.val[2]);
+ e1.val[1] = vorrq_u16(e1.val[1], e1.val[3]);
+
+ e2.val[0] = vorrq_u16(e1.val[0], e1.val[1]);
+
+ e2.val[1] = vorrq_u16(e2.val[1], e2.val[0]);
+ }
+
+ uint16_t ret = vmaxvq_u16(e2.val[1]);
+
+ return ret;
+}
+
+/*
+ * Branchless conditional addition of 2*FALCON_Q if a coefficient is negative.
+ * If a coefficient is then larger than FALCON_Q, FALCON_Q is subtracted from it.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 26 = 8 + 16 + 1 + 1
+ uint16x8x4_t b0, b1; // 8
+ int16x8x4_t a0, a1, c0, c1; // 16
+ int16x8_t neon_q; // 1
+ uint16x8_t neon_2q; // 1
+
+ neon_q = vdupq_n_s16(FALCON_Q);
+ neon_2q = vdupq_n_u16(FALCON_Q << 1);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a0, &f[i]);
+
+ b0.val[0] = vcltzq_s16(a0.val[0]);
+ b0.val[1] = vcltzq_s16(a0.val[1]);
+ b0.val[2] = vcltzq_s16(a0.val[2]);
+ b0.val[3] = vcltzq_s16(a0.val[3]);
+
+ vload_s16_x4(a1, &f[i + 32]);
+
+ // Conditional addition with 2*FALCON_Q
+ b1.val[0] = vcltzq_s16(a1.val[0]);
+ b1.val[1] = vcltzq_s16(a1.val[1]);
+ b1.val[2] = vcltzq_s16(a1.val[2]);
+ b1.val[3] = vcltzq_s16(a1.val[3]);
+
+ c0.val[0] = vreinterpretq_s16_u16(vandq_u16(b0.val[0], neon_2q));
+ c0.val[1] = vreinterpretq_s16_u16(vandq_u16(b0.val[1], neon_2q));
+ c0.val[2] = vreinterpretq_s16_u16(vandq_u16(b0.val[2], neon_2q));
+ c0.val[3] = vreinterpretq_s16_u16(vandq_u16(b0.val[3], neon_2q));
+
+ c1.val[0] = vreinterpretq_s16_u16(vandq_u16(b1.val[0], neon_2q));
+ c1.val[1] = vreinterpretq_s16_u16(vandq_u16(b1.val[1], neon_2q));
+ c1.val[2] = vreinterpretq_s16_u16(vandq_u16(b1.val[2], neon_2q));
+ c1.val[3] = vreinterpretq_s16_u16(vandq_u16(b1.val[3], neon_2q));
+
+ vadd_x4(a0, a0, c0);
+ vadd_x4(a1, a1, c1);
+
+ // a > Q ? 1 : 0
+ b0.val[0] = vcgtq_s16(a0.val[0], neon_q);
+ b0.val[1] = vcgtq_s16(a0.val[1], neon_q);
+ b0.val[2] = vcgtq_s16(a0.val[2], neon_q);
+ b0.val[3] = vcgtq_s16(a0.val[3], neon_q);
+
+ b1.val[0] = vcgtq_s16(a1.val[0], neon_q);
+ b1.val[1] = vcgtq_s16(a1.val[1], neon_q);
+ b1.val[2] = vcgtq_s16(a1.val[2], neon_q);
+ b1.val[3] = vcgtq_s16(a1.val[3], neon_q);
+
+ // Conditional subtraction with FALCON_Q
+
+ c0.val[0] = vandq_s16(vreinterpretq_s16_u16(b0.val[0]), neon_q);
+ c0.val[1] = vandq_s16(vreinterpretq_s16_u16(b0.val[1]), neon_q);
+ c0.val[2] = vandq_s16(vreinterpretq_s16_u16(b0.val[2]), neon_q);
+ c0.val[3] = vandq_s16(vreinterpretq_s16_u16(b0.val[3]), neon_q);
+
+ c1.val[0] = vandq_s16(vreinterpretq_s16_u16(b1.val[0]), neon_q);
+ c1.val[1] = vandq_s16(vreinterpretq_s16_u16(b1.val[1]), neon_q);
+ c1.val[2] = vandq_s16(vreinterpretq_s16_u16(b1.val[2]), neon_q);
+ c1.val[3] = vandq_s16(vreinterpretq_s16_u16(b1.val[3]), neon_q);
+
+ vsub_x4(a0, a0, c0);
+ vsub_x4(a1, a1, c1);
+
+ vstore_s16_x4(&f[i], a0);
+ vstore_s16_x4(&f[i + 32], a1);
+ }
+}
+
+/*
+ * Reduce each coefficient to the centered range via conditional subtraction/addition of Q,
+ * narrow to int8_t, and compare against the bounds min, max = -127, 127.
+ * Return 1 if any coefficient is out of bounds, 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]) {
+ // Total SIMD registers: 32
+ int16x8x4_t a, f; // 8
+ int16x8x4_t d0, d1; // 8
+ uint16x8x4_t c0, c1, x0, x1; // 16
+ uint16x8x2_t e; // 2
+ int8x16x4_t g; // 4
+ int16x8_t neon_127, neon__127, neon_q_2, neon__q_2; // 4
+ uint16x8_t neon_q; // 1
+ neon_127 = vdupq_n_s16(127);
+ neon__127 = vdupq_n_s16(-127);
+ neon_q = vdupq_n_u16(FALCON_Q);
+ neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
+ neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
+
+ e.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &t[i]);
+ vload_s16_x4(f, &t[i + 32]);
+
+ // Conditional subtraction with FALCON_Q
+ // a >= Q/2 ? 1 : 0
+ c0.val[0] = vcgeq_s16(a.val[0], neon_q_2);
+ c0.val[1] = vcgeq_s16(a.val[1], neon_q_2);
+ c0.val[2] = vcgeq_s16(a.val[2], neon_q_2);
+ c0.val[3] = vcgeq_s16(a.val[3], neon_q_2);
+
+ c1.val[0] = vcgeq_s16(f.val[0], neon_q_2);
+ c1.val[1] = vcgeq_s16(f.val[1], neon_q_2);
+ c1.val[2] = vcgeq_s16(f.val[2], neon_q_2);
+ c1.val[3] = vcgeq_s16(f.val[3], neon_q_2);
+
+ // Perform subtraction with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vsub_x4(a, a, d0);
+ vsub_x4(f, f, d1);
+
+ // -Q/2 > a ? 1: 0
+ c0.val[0] = vcgtq_s16(neon__q_2, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__q_2, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__q_2, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__q_2, a.val[3]);
+
+ c1.val[0] = vcgtq_s16(neon__q_2, f.val[0]);
+ c1.val[1] = vcgtq_s16(neon__q_2, f.val[1]);
+ c1.val[2] = vcgtq_s16(neon__q_2, f.val[2]);
+ c1.val[3] = vcgtq_s16(neon__q_2, f.val[3]);
+
+ // Perform addition with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vadd_x4(a, a, d0);
+ vadd_x4(f, f, d1);
+
+ g.val[0] = vmovn_high_s16(vmovn_s16(a.val[0]), a.val[1]);
+ g.val[1] = vmovn_high_s16(vmovn_s16(a.val[2]), a.val[3]);
+ g.val[2] = vmovn_high_s16(vmovn_s16(f.val[0]), f.val[1]);
+ g.val[3] = vmovn_high_s16(vmovn_s16(f.val[2]), f.val[3]);
+
+ vst1q_s8_x4(&G[i], g);
+
+ // -127 > a ? 1 : 0
+ c0.val[0] = vcgtq_s16(neon__127, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__127, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__127, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__127, a.val[3]);
+ // a > 127 ? 1 : 0
+ c1.val[0] = vcgtq_s16(a.val[0], neon_127);
+ c1.val[1] = vcgtq_s16(a.val[1], neon_127);
+ c1.val[2] = vcgtq_s16(a.val[2], neon_127);
+ c1.val[3] = vcgtq_s16(a.val[3], neon_127);
+
+ // -127 > f ? 1 : 0
+ x0.val[0] = vcgtq_s16(neon__127, f.val[0]);
+ x0.val[1] = vcgtq_s16(neon__127, f.val[1]);
+ x0.val[2] = vcgtq_s16(neon__127, f.val[2]);
+ x0.val[3] = vcgtq_s16(neon__127, f.val[3]);
+ // f > 127 ? 1 : 0
+ x1.val[0] = vcgtq_s16(f.val[0], neon_127);
+ x1.val[1] = vcgtq_s16(f.val[1], neon_127);
+ x1.val[2] = vcgtq_s16(f.val[2], neon_127);
+ x1.val[3] = vcgtq_s16(f.val[3], neon_127);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c1.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], c1.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], c1.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], c1.val[3]);
+
+ x0.val[0] = vorrq_u16(x0.val[0], x1.val[0]);
+ x0.val[1] = vorrq_u16(x0.val[1], x1.val[1]);
+ x0.val[2] = vorrq_u16(x0.val[2], x1.val[2]);
+ x0.val[3] = vorrq_u16(x0.val[3], x1.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], x0.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], x0.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], x0.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], x0.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c0.val[2]);
+ c0.val[1] = vorrq_u16(c0.val[1], c0.val[3]);
+
+ e.val[0] = vorrq_u16(c0.val[0], c0.val[1]);
+
+ e.val[1] = vorrq_u16(e.val[1], e.val[0]);
+ }
+ if (vmaxvq_u16(e.val[1])) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Check if (t < low || t > high)
+ * Return 1 if true, 0 otherwise
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high) {
+ // Total SIMD registers: 15
+ int8x16x4_t a; // 4
+ uint8x16x4_t c, d; // 8
+ uint8x16_t e; // 1
+ int8x16_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s8(high);
+ neon_low = vdupq_n_s8(low);
+ e = vdupq_n_u8(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s8(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s8(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s8(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s8(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s8(a.val[0], neon_high);
+ d.val[1] = vcgtq_s8(a.val[1], neon_high);
+ d.val[2] = vcgtq_s8(a.val[2], neon_high);
+ d.val[3] = vcgtq_s8(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u8(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u8(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u8(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u8(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u8(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[1]);
+
+ e = vorrq_u8(e, c.val[0]);
+
+ if (vmaxvq_u8(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Check if (t < low || t > high)
+ * Return 1 if true, 0 otherwise
+ * Works for FALCON_N >= 32, i.e. FALCON_LOGN >= 5
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high) {
+ // Total SIMD registers = 15
+ int16x8x4_t a; // 4
+ uint16x8x4_t c, d; // 8
+ uint16x8_t e; // 1
+ int16x8_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s16(high);
+ neon_low = vdupq_n_s16(low);
+ e = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ a = vld1q_s16_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s16(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s16(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s16(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s16(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s16(a.val[0], neon_high);
+ d.val[1] = vcgtq_s16(a.val[1], neon_high);
+ d.val[2] = vcgtq_s16(a.val[2], neon_high);
+ d.val[3] = vcgtq_s16(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u16(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u16(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u16(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u16(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u16(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[1]);
+
+ e = vorrq_u16(e, c.val[0]);
+
+ if (vmaxvq_u16(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/pqclean.c
new file mode 100644
index 000000000..8cc756323
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/pqclean.c
@@ -0,0 +1,377 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES bytes
+ *
+ * message + signature:
+ * signature PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES bytes
+ * message
+ */
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[28 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N];
+ uint16_t h[FALCON_N];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED1024_AARCH64_keygen(&rng, f, g, F, NULL, h, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + FALCON_LOGN;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ f, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ g, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ F, PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + FALCON_LOGN;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1,
+ h, FALCON_LOGN);
+ if (v != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N], G[FALCON_N];
+ struct {
+ int16_t sig[FALCON_N];
+ uint16_t hm[FALCON_N];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + FALCON_LOGN) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(
+ f, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(
+ g, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(
+ F, PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_AARCH64_complete_private(G, f, g, F, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(&sc, r.hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(sigbuf, sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int16_t h[FALCON_N];
+ int16_t hm[FALCON_N];
+ int16_t sig[FALCON_N];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + FALCON_LOGN) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_modq_decode( (uint16_t *) h,
+ pk + 1, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1, FALCON_LOGN)
+ != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ // The conversion of `h` to the NTT domain is done inside verify_raw()
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
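+ // Padded format: the compressed signature value may be followed by zero
+ // padding up to the fixed length; any non-zero trailing byte is rejected.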
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(&sc, (uint16_t *) hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw(hm, sig, h, (int16_t *) tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + FALCON_LOGN;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + FALCON_LOGN;
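+ /*
+ * The increment below accounts for the header byte, so that
+ * *smlen = mlen + CRYPTO_BYTES.
+ */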
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/rng.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/rng.c
new file mode 100644
index 000000000..33ed43d88
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/rng.c
@@ -0,0 +1,194 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "inner.h"
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_get_seed(void *seed, size_t len) {
+ unsigned char tmp[48];
+ for (size_t i = 0; i < len; i++) {
+ tmp[i] = (unsigned char) i;
+ }
+ memcpy(seed, tmp, len);
+ return 1;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ *(uint32_t *)(p->state.d + (i << 2)) = w;
+ }
+ tl = *(uint32_t *)(p->state.d + 48);
+ th = *(uint32_t *)(p->state.d + 52);
+ *(uint64_t *)(p->state.d + 48) = tl + (th << 32);
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists in key (32 bytes) then IV (16 bytes) and block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
+ */
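+ /*
+ * Concretely: the output buffer holds 8 blocks of 64 bytes
+ * (512 bytes); byte k of word v of block u is written at
+ * offset (u << 2) + (v << 5) + k, i.e. 4*u + 32*v + k.
+ */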
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sampler.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sampler.c
new file mode 100644
index 000000000..1b2e4cde9
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sampler.c
@@ -0,0 +1,292 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include <arm_neon.h>
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+
+ uint32x4x3_t w;
+ uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
+ uint32x2x3_t wh;
+ uint32x2_t cc0h, cc1h, cc2h, zzh;
+ x0 = vdupq_n_u32(v0);
+ x1 = vdupq_n_u32(v1);
+ x2 = vdupq_n_u32(v2);
+
+ // 0: 0, 3, 6, 9
+ // 1: 1, 4, 7, 10
+ // 2: 2, 5, 8, 11
+ // v0 - w0
+ // v1 - w1
+ // v2 - w2
+ // cc1 - cc0 >> 31
+ // cc2 - cc1 >> 31
+ // z + cc2 >> 31
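+ // Each table entry is three 24-bit limbs (dist[3i] being the most
+ // significant); vld3q_u32 de-interleaves four entries at a time.
+ // The vsraq_n_s32 steps propagate the borrow of v - entry from the
+ // low limbs to the high limb, so the sign bit of cc2 is 1 exactly
+ // when the 72-bit random value is below that entry; those sign
+ // bits are accumulated into zz/zzh and summed into z.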
+ w = vld3q_u32(&dist[0]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vshrq_n_u32(cc2, 31);
+
+ w = vld3q_u32(&dist[12]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[24]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[36]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ // 0: 48, 51
+ // 1: 49, 52
+ // 2: 50, 53
+ wh = vld3_u32(&dist[48]);
+ cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
+ cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
+ cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
+ cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
+ cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
+ zzh = vshr_n_u32(cc2h, 31);
+
+ z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
+ return z;
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
+ * PRNG output to limit its consumption, the sign of the difference
+ * yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED1024_AARCH64_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
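+ /* b = 1 gives z = 1 + z0 (so z >= 1); b = 0 gives z = -z0 (so z <= 0). */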
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sign.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sign.c
new file mode 100644
index 000000000..48e0d8dee
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sign.c
@@ -0,0 +1,951 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+#include "util.h"
+#include <string.h>
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
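+ /* This recurrence solves to s(logn) = (logn + 1) * 2^logn. */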
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *restrict tree,
+ fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *restrict tree, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11,
+ unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+
+ ffLDL_fft_inner(tree + n, d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), d00, d00 + hn, logn - 1, tmp);
+
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma_10);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
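+/*
+ * Layout note: the expanded private key therefore holds 4*2^logn fpr
+ * values for the B0 matrix followed by (logn+1)*2^logn values for the
+ * ffLDL tree, i.e. (logn+5)*2^logn fpr values in total (15*1024 for
+ * logn = 10).
+ */
+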
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp) {
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rg = b00;
+ rf = b01;
+ rG = b10;
+ rF = b11;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rg, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rg, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rf, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rf, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(rf, rf, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rG, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rG, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rF, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rF, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(rF, rF, FALCON_LOGN);
+
+ /*
+ * The FFT of the key elements was computed above, with f and F negated.
+ */
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + FALCON_N;
+ g11 = g01 + FALCON_N;
+ gxx = g11 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(g00, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(g00, g00, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(g01, b00, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(g01, g01, b01, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(g11, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(g11, g11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, FALCON_LOGN, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, FALCON_LOGN, FALCON_LOGN);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *restrict t0, fpr *restrict t1,
+ fpr *restrict g00, fpr *restrict g01, fpr *restrict g11,
+ unsigned orig_logn, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma_10);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(z1, t1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t0, t0, tmp, z1, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *restrict z0, fpr *restrict z1,
+ const fpr *restrict tree,
+ const fpr *restrict t0, const fpr *restrict t1, unsigned logn,
+ fpr *restrict tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ // Split
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ // Split
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * General recursive case (logn >= 2).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(tmp, t1, z1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(tmp, t0, tmp, tree, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ t0 = tmp;
+ t1 = t0 + FALCON_N;
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
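+ /*
+ * Since det(B0) = f*G - g*F = q, we have
+ * B0^-1 = (1/q) * [[b11, -b01], [-b10, b00]],
+ * so the target computed above is
+ * (t0, t1) = (hm, 0) * B0^-1 = (hm*b11/q, -hm*b01/q)
+ * in FFT representation.
+ */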
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, FALCON_LOGN, ty + FALCON_N);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t0, tx, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t0, t0, ty, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t0, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t1, tx, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t1, t1, ty, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * Compute the signature.
+ */
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ *
+ * Memory layout:
+ * b00 | b01 | b10 | b11 | t0 | t1
+ * g00 | g01 | g11 | b01 | t0 | t1
+ */
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(t1, b00, b10, FALCON_LOGN); // t1 <- b00*adj(b10)
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(t0, b01, FALCON_LOGN); // t0 <- b01*adj(b01)
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(b00, b00, FALCON_LOGN); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(b00, b00, t0, FALCON_LOGN); // b00 <- g00
+
+ memcpy(t0, b01, FALCON_N * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(b01, t1, b01, b11, FALCON_LOGN); // b01 <- b01*adj(b11)
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(b10, b10, FALCON_LOGN); // b10 <- b10*adj(b10)
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(b10, b10, b11, FALCON_LOGN); // b10 = g11 <- b10*adj(b10) + b11*adj(b11)
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix uses the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = g11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ * t1, g00
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, FALCON_LOGN, FALCON_LOGN, t1 + FALCON_N);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ memmove(b11 + FALCON_N, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(tx, t0, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(ty, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t0, tx, t1, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t1, ty, t1, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t0, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min_10;
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig, expanded_key, hm, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min_10;
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig, f, g, F, G, hm, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.c
new file mode 100644
index 000000000..92300bb57
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.c
@@ -0,0 +1,71 @@
+/*
+ * Utils function
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "util.h"
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, const unsigned logn) {
+ float64x2x4_t neon_flo64, neon_fhi64;
+ int64x2x4_t neon_lo64, neon_hi64;
+ int32x4_t neon_lo32[2], neon_hi32[2];
+ int16x8_t neon_lo16, neon_hi16;
+ int8x16_t neon_8;
+
+ const unsigned falcon_n = 1 << logn;
+
+ for (unsigned i = 0; i < falcon_n; i += 16) {
+ neon_8 = vld1q_s8(&t[i]);
+
+ // Extend from 8 to 16 bit
+ // x7 | x6 | x5 | x5 - x3 | x2 | x1 | x0
+ neon_lo16 = vmovl_s8(vget_low_s8(neon_8));
+ neon_hi16 = vmovl_high_s8(neon_8);
+
+ // Extend from 16 to 32 bit
+ // xxx3 | xxx2 | xxx1 | xxx0
+ neon_lo32[0] = vmovl_s16(vget_low_s16(neon_lo16));
+ neon_lo32[1] = vmovl_high_s16(neon_lo16);
+ neon_hi32[0] = vmovl_s16(vget_low_s16(neon_hi16));
+ neon_hi32[1] = vmovl_high_s16(neon_hi16);
+
+ // Extend from 32 to 64 bit
+ neon_lo64.val[0] = vmovl_s32(vget_low_s32(neon_lo32[0]));
+ neon_lo64.val[1] = vmovl_high_s32(neon_lo32[0]);
+ neon_lo64.val[2] = vmovl_s32(vget_low_s32(neon_lo32[1]));
+ neon_lo64.val[3] = vmovl_high_s32(neon_lo32[1]);
+
+ neon_hi64.val[0] = vmovl_s32(vget_low_s32(neon_hi32[0]));
+ neon_hi64.val[1] = vmovl_high_s32(neon_hi32[0]);
+ neon_hi64.val[2] = vmovl_s32(vget_low_s32(neon_hi32[1]));
+ neon_hi64.val[3] = vmovl_high_s32(neon_hi32[1]);
+
+ vfcvtx4(neon_flo64, neon_lo64);
+ vfcvtx4(neon_fhi64, neon_hi64);
+
+ vstorex4(&r[i], neon_flo64);
+ vstorex4(&r[i + 8], neon_fhi64);
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.h
new file mode 100644
index 000000000..78bd83343
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.h
@@ -0,0 +1,8 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#define poly_small_to_fp PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/vrfy.c
new file mode 100644
index 000000000..0aa6015da
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/vrfy.c
@@ -0,0 +1,174 @@
+/*
+ * Falcon signature verification.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt(int16_t *h) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt_monty(int16_t *h) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_MONT);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ /*
+ * Compute s1 = c0 - s2*h mod phi mod q (in tt[]).
+ */
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT_INV);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(tt, h);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(tt, INVNTT_NONE);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(tt, c0, tt);
+
+ /*
+ * Signature is valid if and only if the aggregate (s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED1024_AARCH64_is_short(tt, s2);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_compute_public(int16_t *h, const int8_t *f, const int8_t *g, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(h, g);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(tt, f);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(tt)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_convert_to_unsigned(h);
+
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp) {
+ int16_t *t1, *t2;
+
+ t1 = (int16_t *)tmp;
+ t2 = t1 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(t1, g);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(t1, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(t2, F);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(t1, t2);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(t2, f);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(t2)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(t1, t2);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(t1, INVNTT_NINV);
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_int16_to_int8(G, t1)) {
+ return 0;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ r = PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(tt);
+
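+ /*
+ * The high bit of r is set when some NTT coefficient of s2 is
+ * zero, so the expression below is 1 exactly when s2 is
+ * invertible mod phi mod q.
+ */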
+ return (int)(1u - (r >> 15));
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(h, c0, s1);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT);
+ r = PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(tt);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = (uint16_t) (~r & (uint16_t) - PQCLEAN_FALCONPADDED1024_AARCH64_is_short(s1, s2));
+ return (int)(r >> 15);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp) {
+ int16_t *s2 = (int16_t *)tmp;
+
+ memcpy(s2, sig, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(s2, NTT_MONT);
+
+ int r = PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(s2);
+
+ return r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/LICENSE b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/api.h b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/api.h
new file mode 100644
index 000000000..da6103260
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED1024_AVX2_API_H
+#define PQCLEAN_FALCONPADDED1024_AVX2_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES 2305
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES 1793
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES 1280
+
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_ALGNAME "Falcon-padded-1024"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
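+
+/*
+ * Illustrative usage sketch (not part of the upstream PQClean header):
+ * a minimal sign/verify round trip with the functions declared above.
+ * It assumes the program is linked against the PQClean/liboqs common
+ * code that provides randombytes() and SHAKE, which key generation and
+ * signing rely on.
+ *
+ *   #include <stdio.h>
+ *   #include "api.h"
+ *
+ *   int main(void) {
+ *       uint8_t pk[PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES];
+ *       uint8_t sk[PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES];
+ *       uint8_t sig[PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES];
+ *       const uint8_t msg[] = "example message";
+ *       size_t siglen;
+ *
+ *       if (PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(pk, sk) != 0) {
+ *           return 1;
+ *       }
+ *       if (PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(
+ *               sig, &siglen, msg, sizeof msg - 1, sk) != 0) {
+ *           return 1;
+ *       }
+ *       if (PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(
+ *               sig, siglen, msg, sizeof msg - 1, pk) != 0) {
+ *           return 1;
+ *       }
+ *       printf("verified a %zu-byte signature\n", siglen);
+ *       return 0;
+ *   }
+ */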
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/codec.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/codec.c
new file mode 100644
index 000000000..84466aa71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * will fit in the accumulator, which is a uint32_t.
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
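+
+/*
+ * Worked example (illustrative, not part of the upstream sources):
+ * encoding the coefficient -1000 with the scheme above.
+ *
+ *   sign bit:        1                  (the value is negative)
+ *   absolute value:  1000 = 0b1111101000
+ *   low 7 bits:      1000 & 127 = 104   (0b1101000)
+ *   high part:       1000 >> 7  = 7     -> seven 0 bits, then a 1
+ *
+ * The coefficient thus costs 8 + 7 + 1 = 16 bits. Small coefficients
+ * compress well: any coefficient with absolute value at most 127 costs
+ * only 9 bits.
+ */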
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for the private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
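+
+/*
+ * Illustrative size check (not part of the upstream sources): for
+ * logn = 10 (n = 1024), f and g use 5 bits per coefficient and F uses
+ * 8 bits per coefficient, so the private key encoding is:
+ *
+ *   1 header byte
+ *   + 1024 * 5 / 8 = 640 bytes for f
+ *   + 1024 * 5 / 8 = 640 bytes for g
+ *   + 1024 * 8 / 8 = 1024 bytes for F
+ *   = 2305 bytes
+ *
+ * which matches CRYPTO_SECRETKEYBYTES in api.h (G is not stored; it is
+ * recomputed from f, g and F). The public key uses 14 bits per
+ * coefficient (see modq_encode above): 1 + 1024 * 14 / 8 = 1793 bytes.
+ */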
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/common.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/common.c
new file mode 100644
index 000000000..affe907eb
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce the values modulo q; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
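+
+/*
+ * Worked example (illustrative, not part of the upstream sources) of
+ * the branchless reduction used above: each step of the form
+ * x - (c & (((x - c) >> 31) - 1)) subtracts c exactly when x >= c.
+ *
+ *   w = 30000:  30000 - 24578 = 5422; then 5422 < 24578 and
+ *               5422 < 12289, so wr = 5422 = 30000 mod 12289, and
+ *               since w < 61445 the sample stays valid.
+ *   w = 61445:  ((w - 61445) >> 31) - 1 = 0xFFFFFFFF, so wr is forced
+ *               to all-ones (0xFFFF once truncated to 16 bits) and the
+ *               sample is treated as invalid by the squeezing passes.
+ */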
+
+/*
+ * The acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
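+
+/*
+ * Illustrative note (not part of the upstream sources): ng collects the
+ * OR of every partial sum, so its top bit records whether any partial
+ * sum ever reached 2^31 (each step adds at most 2^30, so an overflow
+ * cannot skip over bit 31). The final "s |= -(ng >> 31)" then maps any
+ * overflowed computation to 0xFFFFFFFF, which exceeds every entry of
+ * l2bound[], so an overflow can never be mistaken for a short vector.
+ */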
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fft.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fft.c
new file mode 100644
index 000000000..2b8ca7b49
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fft.c
@@ -0,0 +1,1108 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
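+
+/*
+ * Illustrative note (not part of the upstream sources): FPC_DIV is the
+ * usual complex division identity
+ *
+ *   a / b = a * conj(b) / |b|^2
+ *         = (a_re + i*a_im) * (b_re - i*b_im) / (b_re^2 + b_im^2)
+ *
+ * and FPC_INV is the special case a = 1.
+ */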
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
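+
+/*
+ * Small layout example (illustrative, not part of the upstream
+ * sources): for logn = 2 (N = 4), rev() works on 2 bits, so rev(0) = 0
+ * and rev(1) = 2, and the stored values are:
+ *
+ *   slot 0: Re(f(w_0))   slot 1: Re(f(w_1))
+ *   slot 2: Im(f(w_0))   slot 3: Im(f(w_1))
+ *
+ * i.e. the N/2 real parts occupy the first half of the array and the
+ * matching imaginary parts occupy the second half.
+ */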
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ if (ht >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ z_re = _mm256_loadu_pd(&f[j + ht].v);
+ z_im = _mm256_loadu_pd(&f[j + ht + hn].v);
+ y_re = FMSUB(z_re, s_re,
+ _mm256_mul_pd(z_im, s_im));
+ y_im = FMADD(z_re, s_im,
+ _mm256_mul_pd(z_im, s_re));
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ _mm256_storeu_pd(&f[j + ht].v,
+ _mm256_sub_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + ht + hn].v,
+ _mm256_sub_pd(x_im, y_im));
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im,
+ y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+ * simple doubling of all the values in the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ if (t >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ y_re = _mm256_loadu_pd(&f[j + t].v);
+ y_im = _mm256_loadu_pd(&f[j + t + hn].v);
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ x_re = _mm256_sub_pd(y_re, x_re);
+ x_im = _mm256_sub_pd(x_im, y_im);
+ z_re = FMSUB(x_im, s_im,
+ _mm256_mul_pd(x_re, s_re));
+ z_im = FMADD(x_re, s_im,
+ _mm256_mul_pd(x_im, s_re));
+ _mm256_storeu_pd(&f[j + t].v, z_re);
+ _mm256_storeu_pd(&f[j + t + hn].v, z_im);
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im,
+ x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_add_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_sub_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 8) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = (n >> 1); u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMSUB(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMADD(
+ a_re, b_im, _mm256_mul_pd(a_im, b_re));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d zero;
+
+ zero = _mm256_setzero_pd();
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v,
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)));
+ _mm256_storeu_pd(&a[u + hn].v, zero);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d x4;
+
+ x4 = _mm256_set1_pd(x.v);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im, t;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ t = _mm256_div_pd(one,
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im)));
+ b_re = _mm256_mul_pd(b_re, t);
+ b_im = _mm256_mul_pd(b_im, t);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, dv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ dv = _mm256_div_pd(one,
+ _mm256_add_pd(
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)),
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im))));
+ _mm256_storeu_pd(&d[u].v, dv);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d F_re, F_im, G_re, G_im;
+ __m256d f_re, f_im, g_re, g_im;
+ __m256d a_re, a_im, b_re, b_im;
+
+ F_re = _mm256_loadu_pd(&F[u].v);
+ F_im = _mm256_loadu_pd(&F[u + hn].v);
+ G_re = _mm256_loadu_pd(&G[u].v);
+ G_im = _mm256_loadu_pd(&G[u + hn].v);
+ f_re = _mm256_loadu_pd(&f[u].v);
+ f_im = _mm256_loadu_pd(&f[u + hn].v);
+ g_re = _mm256_loadu_pd(&g[u].v);
+ g_im = _mm256_loadu_pd(&g[u + hn].v);
+
+ a_re = FMADD(F_re, f_re,
+ _mm256_mul_pd(F_im, f_im));
+ a_im = FMSUB(F_im, f_re,
+ _mm256_mul_pd(F_re, f_im));
+ b_re = FMADD(G_re, g_re,
+ _mm256_mul_pd(G_im, g_im));
+ b_im = FMSUB(G_im, g_re,
+ _mm256_mul_pd(G_re, g_im));
+ _mm256_storeu_pd(&d[u].v,
+ _mm256_add_pd(a_re, b_re));
+ _mm256_storeu_pd(&d[u + hn].v,
+ _mm256_add_pd(a_im, b_im));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, bv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ bv = _mm256_loadu_pd(&b[u].v);
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(a_re, bv));
+ _mm256_storeu_pd(&a[u + hn].v,
+ _mm256_mul_pd(a_im, bv));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d ib, a_re, a_im;
+
+ ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
+ _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&g11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&g11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&g01[u].v, mu_re);
+ _mm256_storeu_pd(&g01[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&d11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&d11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&l10[u].v, mu_re);
+ _mm256_storeu_pd(&l10[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * bit-reversal function over the ring degree). This changes
+ * indexes with regard to the Falcon specification.
+ */
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 8) {
+ __m256d half, sv;
+
+ half = _mm256_set1_pd(0.5);
+ sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
+ for (u = 0; u < qn; u += 2) {
+ __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;
+
+ ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
+ ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
+ ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
+ ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
+ _mm_storeu_pd(&f0[u].v,
+ _mm256_extractf128_pd(ff0, 0));
+ _mm_storeu_pd(&f0[u + qn].v,
+ _mm256_extractf128_pd(ff0, 1));
+
+ ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
+ gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
+ ff3 = _mm256_hadd_pd(
+ _mm256_mul_pd(ff1, gmt),
+ _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
+ ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
+ _mm_storeu_pd(&f1[u].v,
+ _mm256_extractf128_pd(ff3, 0));
+ _mm_storeu_pd(&f1[u + qn].v,
+ _mm256_extractf128_pd(ff3, 1));
+ }
+ } else {
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 16) {
+ for (u = 0; u < qn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+ __m256d gm1, gm2, g_re, g_im;
+ __m256d t_re, t_im, u_re, u_im;
+ __m256d tu1_re, tu2_re, tu1_im, tu2_im;
+
+ a_re = _mm256_loadu_pd(&f0[u].v);
+ a_im = _mm256_loadu_pd(&f0[u + qn].v);
+ c_re = _mm256_loadu_pd(&f1[u].v);
+ c_im = _mm256_loadu_pd(&f1[u + qn].v);
+
+ gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
+ g_re = _mm256_unpacklo_pd(gm1, gm2);
+ g_im = _mm256_unpackhi_pd(gm1, gm2);
+ g_re = _mm256_permute4x64_pd(g_re, 0xD8);
+ g_im = _mm256_permute4x64_pd(g_im, 0xD8);
+
+ b_re = FMSUB(
+ c_re, g_re, _mm256_mul_pd(c_im, g_im));
+ b_im = FMADD(
+ c_re, g_im, _mm256_mul_pd(c_im, g_re));
+
+ t_re = _mm256_add_pd(a_re, b_re);
+ t_im = _mm256_add_pd(a_im, b_im);
+ u_re = _mm256_sub_pd(a_re, b_re);
+ u_im = _mm256_sub_pd(a_im, b_im);
+
+ tu1_re = _mm256_unpacklo_pd(t_re, u_re);
+ tu2_re = _mm256_unpackhi_pd(t_re, u_re);
+ tu1_im = _mm256_unpacklo_pd(t_im, u_im);
+ tu2_im = _mm256_unpackhi_pd(t_im, u_im);
+ _mm256_storeu_pd(&f[(u << 1)].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
+ _mm256_storeu_pd(&f[(u << 1) + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
+ }
+ } else {
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+ }
+}
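+
+/*
+ * Illustrative note (not part of the upstream sources):
+ * poly_split_fft and poly_merge_fft are inverses of each other: given
+ * f in FFT representation, splitting it into the half-size f0 and f1
+ * and merging them back reproduces f (up to floating-point rounding).
+ * The split form is what the recursive Fourier-domain routines of the
+ * signer operate on.
+ */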
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.c
new file mode 100644
index 000000000..8940f3400
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.c
@@ -0,0 +1,1076 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+const fpr fpr_gm_tab[] = {
+ {0}, {0}, /* unused */
+ {-0.000000000000000000000000000}, { 1.000000000000000000000000000},
+ { 0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ {-0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ { 0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ {-0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ { 0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ {-0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ { 0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ {-0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ { 0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ {-0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ { 0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ {-0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ { 0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ {-0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ { 0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ {-0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ { 0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ {-0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ { 0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ {-0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ { 0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ {-0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ { 0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ {-0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ { 0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ {-0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ { 0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ {-0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ { 0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ {-0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ { 0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ {-0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ { 0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ {-0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ { 0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ {-0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ { 0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ {-0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ { 0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ {-0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ { 0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ {-0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ { 0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ {-0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ { 0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ {-0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ { 0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ {-0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ { 0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ {-0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ { 0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ {-0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ { 0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ {-0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ { 0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ {-0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ { 0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ {-0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ { 0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ {-0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ { 0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ {-0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ { 0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ {-0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ { 0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ {-0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ { 0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ {-0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ { 0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ {-0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ { 0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ {-0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ { 0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ {-0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ { 0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ {-0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ { 0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ {-0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ { 0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ {-0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ { 0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ {-0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ { 0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ {-0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ { 0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ {-0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ { 0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ {-0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ { 0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ {-0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ { 0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ {-0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ { 0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ {-0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ { 0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ {-0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ { 0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ {-0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ { 0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ {-0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ { 0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ {-0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ { 0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ {-0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ { 0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ {-0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ { 0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ {-0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ { 0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ {-0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ { 0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ {-0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ { 0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ {-0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ { 0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ {-0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ { 0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ {-0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ { 0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ {-0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ { 0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ {-0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ { 0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ {-0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ { 0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ {-0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ { 0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ {-0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ { 0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ {-0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ { 0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ {-0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ { 0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ {-0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ { 0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ {-0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ { 0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ {-0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ { 0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ {-0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ { 0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ {-0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ { 0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ {-0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ { 0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ {-0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ { 0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ {-0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ { 0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ {-0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ { 0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ {-0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ { 0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ {-0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ { 0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ {-0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ { 0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ {-0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ { 0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ {-0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ { 0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ {-0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ { 0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ {-0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ { 0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ {-0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ { 0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ {-0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ { 0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ {-0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ { 0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ {-0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ { 0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ {-0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ { 0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ {-0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ { 0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ {-0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ { 0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ {-0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ { 0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ {-0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ { 0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ {-0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ { 0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ {-0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ { 0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ {-0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ { 0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ {-0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ { 0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ {-0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ { 0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ {-0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ { 0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ {-0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ { 0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ {-0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ { 0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ {-0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ { 0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ {-0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ { 0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ {-0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ { 0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ {-0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ { 0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ {-0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ { 0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ {-0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ { 0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ {-0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ { 0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ {-0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ { 0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ {-0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ { 0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ {-0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ { 0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ {-0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ { 0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ {-0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ { 0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ {-0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ { 0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ {-0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ { 0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ {-0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ { 0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ {-0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ { 0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ {-0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ { 0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ {-0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ { 0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ {-0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ { 0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ {-0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ { 0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ {-0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ { 0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ {-0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ { 0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ {-0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ { 0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ {-0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ { 0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ {-0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ { 0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ {-0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ { 0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ {-0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ { 0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ {-0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ { 0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ {-0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ { 0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ {-0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ { 0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ {-0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ { 0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ {-0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ { 0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ {-0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ { 0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ {-0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ { 0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ {-0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ { 0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ {-0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ { 0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ {-0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ { 0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ {-0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ { 0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ {-0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ { 0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ {-0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ { 0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ {-0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ { 0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ {-0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ { 0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ {-0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ { 0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ {-0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ { 0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ {-0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ { 0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ {-0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ { 0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ {-0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ { 0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ {-0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ { 0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ {-0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ { 0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ {-0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ { 0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ {-0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ { 0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ {-0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ { 0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ {-0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ { 0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ {-0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ { 0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ {-0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ { 0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ {-0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ { 0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ {-0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ { 0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ {-0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ { 0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ {-0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ { 0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ {-0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ { 0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ {-0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ { 0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ {-0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ { 0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ {-0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ { 0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ {-0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ { 0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ {-0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ { 0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ {-0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ { 0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ {-0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ { 0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ {-0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ { 0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ {-0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ { 0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ {-0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ { 0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ {-0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ { 0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ {-0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ { 0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ {-0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ { 0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ {-0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ { 0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ {-0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ { 0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ {-0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ { 0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ {-0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ { 0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ {-0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ { 0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ {-0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ { 0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ {-0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ { 0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ {-0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ { 0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ {-0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ { 0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ {-0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ { 0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ {-0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ { 0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ {-0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ { 0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ {-0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ { 0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ {-0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ { 0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ {-0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ { 0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ {-0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ { 0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ {-0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ { 0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ {-0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ { 0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ {-0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ { 0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ {-0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ { 0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ {-0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ { 0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ {-0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ { 0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ {-0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ { 0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ {-0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ { 0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ {-0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ { 0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ {-0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ { 0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ {-0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ { 0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ {-0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ { 0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ {-0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ { 0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ {-0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ { 0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ {-0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ { 0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ {-0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ { 0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ {-0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ { 0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ {-0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ { 0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ {-0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ { 0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ {-0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ { 0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ {-0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ { 0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ {-0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ { 0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ {-0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ { 0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ {-0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ { 0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ {-0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ { 0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ {-0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ { 0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ {-0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ { 0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ {-0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ { 0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ {-0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ { 0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ {-0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ { 0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ {-0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ { 0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ {-0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ { 0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ {-0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ { 0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ {-0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ { 0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ {-0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ { 0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ {-0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ { 0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ {-0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ { 0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ {-0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ { 0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ {-0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ { 0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ {-0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ { 0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ {-0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ { 0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ {-0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ { 0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ {-0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ { 0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ {-0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ { 0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ {-0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ { 0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ {-0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ { 0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ {-0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ { 0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ {-0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ { 0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ {-0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ { 0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ {-0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ { 0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ {-0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ { 0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ {-0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ { 0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ {-0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ { 0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ {-0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ { 0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ {-0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ { 0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ {-0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ { 0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ {-0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ { 0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ {-0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ { 0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ {-0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ { 0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ {-0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ { 0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ {-0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ { 0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ {-0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ { 0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ {-0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ { 0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ {-0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ { 0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ {-0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ { 0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ {-0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ { 0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ {-0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ { 0.999995293809576171511580126}, { 0.003067956762965976270145365},
+ {-0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ { 0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ {-0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ { 0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ {-0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ { 0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ {-0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ { 0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ {-0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ { 0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ {-0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ { 0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ {-0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ { 0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ {-0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ { 0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ {-0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ { 0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ {-0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ { 0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ {-0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ { 0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ {-0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ { 0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ {-0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ { 0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ {-0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ { 0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ {-0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ { 0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ {-0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ { 0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ {-0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ { 0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ {-0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ { 0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ {-0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ { 0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ {-0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ { 0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ {-0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ { 0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ {-0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ { 0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ {-0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ { 0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ {-0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ { 0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ {-0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ { 0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ {-0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ { 0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ {-0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ { 0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ {-0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ { 0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ {-0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ { 0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ {-0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ { 0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ {-0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ { 0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ {-0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ { 0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ {-0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ { 0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ {-0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ { 0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ {-0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ { 0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ {-0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ { 0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ {-0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ { 0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ {-0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ { 0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ {-0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ { 0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ {-0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ { 0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ {-0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ { 0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ {-0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ { 0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ {-0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ { 0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ {-0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ { 0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ {-0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ { 0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ {-0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ { 0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ {-0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ { 0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ {-0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ { 0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ {-0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ { 0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ {-0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ { 0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ {-0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ { 0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ {-0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ { 0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ {-0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ { 0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ {-0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ { 0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ {-0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ { 0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ {-0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ { 0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ {-0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ { 0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ {-0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ { 0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ {-0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ { 0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ {-0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ { 0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ {-0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ { 0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ {-0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ { 0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ {-0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ { 0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ {-0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ { 0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ {-0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ { 0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ {-0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ { 0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ {-0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ { 0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ {-0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ { 0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ {-0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ { 0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ {-0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ { 0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ {-0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ { 0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ {-0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ { 0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ {-0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ { 0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ {-0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ { 0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ {-0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ { 0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ {-0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ { 0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ {-0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ { 0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ {-0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ { 0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ {-0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ { 0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ {-0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ { 0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ {-0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ { 0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ {-0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ { 0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ {-0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ { 0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ {-0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ { 0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ {-0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ { 0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ {-0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ { 0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ {-0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ { 0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ {-0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ { 0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ {-0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ { 0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ {-0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ { 0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ {-0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ { 0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ {-0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ { 0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ {-0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ { 0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ {-0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ { 0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ {-0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ { 0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ {-0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ { 0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ {-0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ { 0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ {-0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ { 0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ {-0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ { 0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ {-0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ { 0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ {-0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ { 0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ {-0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ { 0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ {-0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ { 0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ {-0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ { 0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ {-0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ { 0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ {-0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ { 0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ {-0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ { 0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ {-0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ { 0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ {-0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ { 0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ {-0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ { 0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ {-0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ { 0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ {-0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ { 0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ {-0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ { 0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ {-0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ { 0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ {-0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ { 0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ {-0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ { 0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ {-0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ { 0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ {-0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ { 0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ {-0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ { 0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ {-0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ { 0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ {-0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ { 0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ {-0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ { 0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ {-0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ { 0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ {-0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ { 0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ {-0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ { 0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ {-0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ { 0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ {-0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ { 0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ {-0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ { 0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ {-0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ { 0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ {-0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ { 0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ {-0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ { 0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ {-0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ { 0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ {-0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ { 0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ {-0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ { 0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ {-0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ { 0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ {-0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ { 0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ {-0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ { 0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ {-0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ { 0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ {-0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ { 0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ {-0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ { 0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ {-0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ { 0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ {-0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ { 0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ {-0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ { 0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ {-0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ { 0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ {-0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ { 0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ {-0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ { 0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ {-0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ { 0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ {-0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ { 0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ {-0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ { 0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ {-0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ { 0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ {-0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ { 0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ {-0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ { 0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ {-0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ { 0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ {-0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ { 0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ {-0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ { 0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ {-0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ { 0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ {-0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ { 0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ {-0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ { 0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ {-0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ { 0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ {-0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ { 0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ {-0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ { 0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ {-0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ { 0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ {-0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ { 0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ {-0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ { 0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ {-0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ { 0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ {-0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ { 0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ {-0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ { 0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ {-0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ { 0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ {-0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ { 0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ {-0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ { 0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ {-0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ { 0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ {-0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ { 0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ {-0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ { 0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ {-0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ { 0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ {-0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ { 0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ {-0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ { 0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ {-0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ { 0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ {-0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ { 0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ {-0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ { 0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ {-0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ { 0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ {-0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ { 0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ {-0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ { 0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ {-0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ { 0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ {-0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ { 0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ {-0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ { 0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ {-0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ { 0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ {-0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ { 0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ {-0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ { 0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ {-0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ { 0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ {-0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ { 0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ {-0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ { 0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ {-0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ { 0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ {-0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ { 0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ {-0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ { 0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ {-0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ { 0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ {-0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ { 0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ {-0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ { 0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ {-0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ { 0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ {-0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ { 0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ {-0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ { 0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ {-0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ { 0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ {-0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ { 0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ {-0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ { 0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ {-0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ { 0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ {-0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ { 0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ {-0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ { 0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ {-0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ { 0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ {-0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ { 0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ {-0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ { 0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ {-0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ { 0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ {-0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ { 0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ {-0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ { 0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ {-0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ { 0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ {-0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ { 0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ {-0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ { 0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ {-0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ { 0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ {-0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ { 0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ {-0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ { 0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ {-0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ { 0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ {-0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ { 0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ {-0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ { 0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ {-0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ { 0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ {-0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ { 0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ {-0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ { 0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ {-0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ { 0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ {-0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ { 0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ {-0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ { 0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ {-0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ { 0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ {-0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ { 0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ {-0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ { 0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ {-0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ { 0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ {-0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ { 0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ {-0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ { 0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ {-0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ { 0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ {-0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ { 0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ {-0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ { 0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ {-0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ { 0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ {-0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ { 0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ {-0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ { 0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ {-0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ { 0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ {-0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ { 0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ {-0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ { 0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ {-0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ { 0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ {-0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ { 0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ {-0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ { 0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ {-0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ { 0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ {-0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ { 0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ {-0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ { 0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ {-0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ { 0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ {-0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ { 0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ {-0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ { 0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ {-0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ { 0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ {-0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ { 0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ {-0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ { 0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ {-0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ { 0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ {-0.999995293809576171511580126}, { 0.003067956762965976270145365}
+};
+
+const fpr fpr_p2_tab[] = {
+ { 2.00000000000 },
+ { 1.00000000000 },
+ { 0.50000000000 },
+ { 0.25000000000 },
+ { 0.12500000000 },
+ { 0.06250000000 },
+ { 0.03125000000 },
+ { 0.01562500000 },
+ { 0.00781250000 },
+ { 0.00390625000 },
+ { 0.00195312500 }
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.h b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.h
new file mode 100644
index 000000000..6073efff3
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.h
@@ -0,0 +1,362 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <immintrin.h>
+
+/*
+ * We wrap the native 'double' type into a structure so that the C compiler
+ * complains if we inadvertently use raw arithmetic operators on the 'fpr'
+ * type instead of using the inline functions below. This should have no
+ * extra runtime cost, since all the functions below are 'inline'.
+ */
+typedef struct {
+ double v;
+} fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x.v = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return FPR((double)i);
+}
+
+static const fpr fpr_q = { 12289.0 };
+static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
+static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 };
+static const fpr fpr_inv_sigma[] = {
+ { 0.0 }, /* unused */
+ { 0.0069054793295940891952143765991630516 },
+ { 0.0068102267767177975961393730687908629 },
+ { 0.0067188101910722710707826117910434131 },
+ { 0.0065883354370073665545865037227681924 },
+ { 0.0064651781207602900738053897763485516 },
+ { 0.0063486788828078995327741182928037856 },
+ { 0.0062382586529084374473367528433697537 },
+ { 0.0061334065020930261548984001431770281 },
+ { 0.0060336696681577241031668062510953022 },
+ { 0.0059386453095331159950250124336477482 }
+};
+static const fpr fpr_sigma_min[] = {
+ { 0.0 }, /* unused */
+ { 1.1165085072329102588881898380334015 },
+ { 1.1321247692325272405718031785357108 },
+ { 1.1475285353733668684571123112513188 },
+ { 1.1702540788534828939713084716509250 },
+ { 1.1925466358390344011122170489094133 },
+ { 1.2144300507766139921088487776957699 },
+ { 1.2359260567719808790104525941706723 },
+ { 1.2570545284063214162779743112075080 },
+ { 1.2778336969128335860256340575729042 },
+ { 1.2982803343442918539708792538826807 }
+};
+static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
+static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
+static const fpr fpr_bnorm_max = { 16822.4121 };
+static const fpr fpr_zero = { 0.0 };
+static const fpr fpr_one = { 1.0 };
+static const fpr fpr_two = { 2.0 };
+static const fpr fpr_onehalf = { 0.5 };
+static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
+static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
+static const fpr fpr_ptwo31 = { 2147483648.0 };
+static const fpr fpr_ptwo31m1 = { 2147483647.0 };
+static const fpr fpr_mtwo31m1 = { -2147483647.0 };
+static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
+static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
+static const fpr fpr_ptwo63 = { 9223372036854775808.0 };
+
+static inline int64_t
+fpr_rint(fpr x) {
+ /*
+ * We do not want to use llrint() since it might be not
+ * constant-time.
+ *
+ * Suppose that x >= 0. If x >= 2^52, then it is already an
+ * integer. Otherwise, if x < 2^52, then computing x+2^52 will
+ * yield a value that will be rounded to the nearest integer
+ * with exactly the right rules (round-to-nearest-even).
+ *
+ * In order to have constant-time processing, we must do the
+ * computation for both x >= 0 and x < 0 cases, and use a
+ * cast to an integer to access the sign and select the proper
+ * value. Such casts also allow us to find out if |x| < 2^52.
+ */
+ int64_t sx, tx, rp, rn, m;
+ uint32_t ub;
+
+ sx = (int64_t)(x.v - 1.0);
+ tx = (int64_t)x.v;
+ rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
+ rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
+
+ /*
+ * If tx >= 2^52 or tx < -2^52, then result is tx.
+ * Otherwise, if sx >= 0, then result is rp.
+ * Otherwise, result is rn. We use the fact that when x is
+ * close to 0 (|x| <= 0.25) then both rp and rn are correct;
+ * and if x is not close to 0, then trunc(x-1.0) yields the
+ * appropriate sign.
+ */
+
+ /*
+ * Clamp rp to zero if tx < 0.
+ * Clamp rn to zero if tx >= 0.
+ */
+ m = sx >> 63;
+ rn &= m;
+ rp &= ~m;
+
+ /*
+ * Get the 12 upper bits of tx; if they are not all zeros or
+ * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
+ * rp and rn to zero. Otherwise, we clamp tx to zero.
+ */
+ ub = (uint32_t)((uint64_t)tx >> 52);
+ m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
+ rp &= m;
+ rn &= m;
+ tx &= ~m;
+
+ /*
+ * Only one of tx, rn or rp (at most) can be non-zero at this
+ * point.
+ */
+ return tx | rn | rp;
+}
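
The 2^52 trick used in fpr_rint() above is easier to see in isolation. The following stand-alone snippet (illustration only, not part of the patch) shows how adding and then subtracting 2^52 rounds a small non-negative double to the nearest integer, ties to even, without calling llrint():

    #include <stdio.h>

    int main(void) {
        /* 4503599627370496.0 == 2^52: adding it pushes the fractional bits
         * out of the 52-bit mantissa, so the addition itself performs
         * round-to-nearest-even as a side effect. */
        double in[] = { 0.4, 0.5, 1.5, 2.49999, 3.75 };
        for (int i = 0; i < 5; i++) {
            long long r = (long long)(in[i] + 4503599627370496.0)
                          - 4503599627370496LL;
            printf("%.5f -> %lld\n", in[i], r);  /* prints 0, 0, 2, 2, 4 */
        }
        return 0;
    }
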
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x.v;
+ return r - (x.v < (double)r);
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x.v;
+}
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return FPR(x.v + y.v);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return FPR(x.v - y.v);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return FPR(-x.v);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return FPR(x.v * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return FPR(x.v + x.v);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return FPR(x.v * y.v);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return FPR(x.v * x.v);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return FPR(1.0 / x.v);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return FPR(x.v / y.v);
+}
+
+static inline void
+fpr_sqrt_avx2(double *t) {
+ __m128d x;
+
+ x = _mm_load1_pd(t);
+ x = _mm_sqrt_pd(x);
+ _mm_storel_pd(t, x);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ /*
+ * We prefer not to have a dependency on libm when it can be
+ * avoided. On x86, calling the sqrt() libm function inlines
+ * the relevant opcode (fsqrt or sqrtsd, depending on whether
+ * the 387 FPU or SSE2 is used for floating-point operations)
+ * but then makes an optional call to the library function
+ * for proper error handling, in case the operand is negative.
+ *
+ * To avoid this dependency, we use intrinsics or inline assembly
+ * on recognized platforms:
+ *
+ * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
+ * to call the 387 FPU fsqrt opcode.
+ *
+ * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
+ * the fsqrt opcode (Clang needs a special hack).
+ *
+ * - On GCC/Clang on ARM with hardware floating-point, we use
+ * inline assembly to call the vsqrt.f64 opcode. Due to a
+ * complex ecosystem of compilers and assembly syntaxes, we
+ * have to call it "fsqrt" or "fsqrtd", depending on case.
+ *
+ * If the platform is not recognized, a call to the system
+ * library function sqrt() is performed. On some compilers, this
+ * may actually inline the relevant opcode, and call the library
+ * function only when the input is invalid (e.g. negative);
+ * Falcon never actually calls sqrt() on a negative value, but
+ * the dependency to libm will still be there.
+ */
+
+ fpr_sqrt_avx2(&x.v);
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x.v < y.v;
+}
+
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+
+ /*
+ * AVX2 implementation uses more operations than Horner's method,
+ * but with a lower expression tree depth. This helps because
+ * additions and multiplications have a latency of 4 cycles on
+ * a Skylake, but the CPU can issue two of them per cycle.
+ */
+
+ static const union {
+ double d[12];
+ __m256d v[3];
+ } c = {
+ {
+ 0.999999999999994892974086724280,
+ 0.500000000000019206858326015208,
+ 0.166666666666984014666397229121,
+ 0.041666666666110491190622155955,
+ 0.008333333327800835146903501993,
+ 0.001388888894063186997887560103,
+ 0.000198412739277311890541063977,
+ 0.000024801566833585381209939524,
+ 0.000002755586350219122514855659,
+ 0.000000275607356160477811864927,
+ 0.000000025299506379442070029551,
+ 0.000000002073772366009083061987
+ }
+ };
+
+ double d1, d2, d4, d8, y;
+ __m256d d14, d58, d9c;
+
+ d1 = -x.v;
+ d2 = d1 * d1;
+ d4 = d2 * d2;
+ d8 = d4 * d4;
+ d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
+ d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
+ d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
+ d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
+ d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
+ d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
+ d9c = _mm256_hadd_pd(d9c, d9c);
+ y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
+ + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
+ y *= ccs.v;
+
+ /*
+ * Final conversion goes through int64_t first, because that's what
+ * the underlying opcode (vcvttsd2si) will do, and we know that the
+ * result will fit, since x >= 0 and ccs < 1. If we did the
+ * conversion directly to uint64_t, then the compiler would add some
+ * extra code to cover the case of a source value of 2^63 or more,
+ * and though the alternate path would never be exercised, the
+ * extra comparison would cost us some cycles.
+ */
+ return (uint64_t)(int64_t)(y * fpr_ptwo63.v);
+
+}
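
For contrast with the vectorised evaluation above, the same 12-term polynomial written as a plain Horner scheme looks as follows (a sketch for illustration only, not code from the patch; the multiplication by ccs and the 2^63 scaling are omitted):

    /* c[] are the 12 coefficients from the table above; d = -x. */
    static double expm_poly_scalar(double x, const double c[12]) {
        double d = -x;
        double y = c[11];
        for (int i = 10; i >= 0; i--) {
            y = y * d + c[i];    /* long dependency chain, fewer operations */
        }
        return 1.0 + y * d;      /* 1 + c[0]*d + c[1]*d^2 + ... + c[11]*d^12 */
    }
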
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED1024_AVX2_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED1024_AVX2_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/inner.h b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/inner.h
new file mode 100644
index 000000000..5c0d57b22
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/inner.h
@@ -0,0 +1,827 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED1024_AVX2_ macro (e.g. PQCLEAN_FALCONPADDED1024_AVX2_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * This implementation uses AVX2 and optionally FMA intrinsics.
+ */
+#include <immintrin.h>
+#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)
+#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
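
As a usage note (not part of the patch), the macros above map one-to-one onto the incremental SHAKE256 API from fips202.h; a typical absorb/squeeze sequence therefore looks like this sketch:

    #include <stddef.h>
    #include <stdint.h>
    #include "inner.h"   /* brings in the inner_shake256_* macros */

    static void shake_usage_sketch(const uint8_t *msg, size_t msg_len,
                                   uint8_t *out, size_t out_len) {
        inner_shake256_context sc;

        inner_shake256_init(&sc);
        inner_shake256_inject(&sc, msg, msg_len);   /* absorb input */
        inner_shake256_flip(&sc);                   /* finalize absorb phase */
        inner_shake256_extract(&sc, out, out_len);  /* squeeze output bytes */
        inner_shake256_ctx_release(&sc);            /* release incremental state */
    }
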
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED1024_AVX2_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED1024_AVX2_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ * return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
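
The wrappers listed above compose in the obvious way; for instance (a sketch, not code from the patch), a squared Euclidean norm of two coordinates would be written as:

    /* Hypothetical helper, shown only to illustrate the fpr interface. */
    static fpr norm2_sketch(fpr x, fpr y) {
        return fpr_add(fpr_sqr(x), fpr_sqr(y));
    }
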
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+ }
+ return v;
+}
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
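
A quick check of the macro values above against the "28*2^logn bytes" rule (illustration only, not part of the patch):

    /* Holds for logn >= 3; logn = 1 and 2 are the documented exceptions. */
    _Static_assert(FALCON_KEYGEN_TEMP_3  == (28u << 3),  "28 * 2^3  = 224");
    _Static_assert(FALCON_KEYGEN_TEMP_10 == (28u << 10), "28 * 2^10 = 28672");
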
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED1024_AVX2_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED1024_AVX2_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED1024_AVX2_gaussian0_sampler(prng *p);
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/keygen.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/keygen.c
new file mode 100644
index 000000000..d3197b8c7
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/keygen.c
@@ -0,0 +1,4233 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
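+ *
+ * For a concrete illustration, take the first prime of the PRIMES[]
+ * table below, p = 2147473409, so that R = 2^31 - p = 10239. The
+ * Montgomery representation of 2 is 2*R = 20478, and Montgomery-
+ * multiplying 20478 by itself yields (2*R * 2*R)/R = 4*R = 40956,
+ * which is the Montgomery representation of 4, as expected.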
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
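+ *
+ * For instance, in the first entry below, the "product of all previous
+ * primes" is the empty product 1, so s is simply the Montgomery
+ * representation of 1, i.e. R = 2^31 - 2147473409 = 10239.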
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
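+ * The result is x itself if x <= (p-1)/2, and x - p otherwise.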
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
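+ *
+ * The initial y = 2 - p is an inverse of p modulo 2^2; each
+ * y *= 2 - p*y step is a Newton/Hensel lift that doubles the number
+ * of exact low-order bits (2 -> 4 -> 8 -> 16 -> 32), so the four
+ * steps below are enough for 31 bits. The final negation and mask
+ * turn 1/p into -1/p mod 2^31.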
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
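+ * The result is a*b/2^31 mod p, in the 0..p-1 range.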
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also remove one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
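+ *
+ * For example, over 10 bits, rev(1) = 0b1000000000 = 512 and
+ * rev(3) = 0b1100000000 = 768, matching REV10[1] and REV10[3] above.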
+ */
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
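+ *
+ * This is the usual iterative radix-2 NTT: each of the logn passes
+ * applies butterflies (x, y) -> (x + s*y, x - s*y), with the twiddle
+ * factor s read from gm[] (which modp_mkgm2() filled in bit-reversed
+ * order).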
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
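+ *
+ * In NTT representation this is just a point-wise product of each
+ * consecutive pair of values; the extra Montgomery multiplication by
+ * R2 cancels the 1/R factor introduced by the first one, so the
+ * result is the plain product modulo p.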
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
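+ *
+ * For example, the value 2^31 is encoded as the two words { 0, 1 }
+ * (words are little-endian, 31 value bits per word).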
+ */
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
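+ *
+ * When d is negative, its two's complement encoding (bit 30 of the
+ * top word set) represents d + 2^(31*dlen); the conditional
+ * subtraction of Rx below compensates for that offset modulo p.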
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
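+ *
+ * This is essentially Garner's incremental CRT: at each step, if q is
+ * the product of the primes already processed, the partially rebuilt
+ * value (known modulo q) is adjusted by a multiple of q so that it
+ * also matches the residue modulo the new prime p. The precomputed
+ * 's' field of PRIMES[] supplies the required 1/q mod p constant, in
+ * Montgomery representation so that a single Montgomery
+ * multiplication yields a plain value.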
+ */
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have a 32-th bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remaining of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step will reduce the total length
+ * (sum of lengths of a and b) by at least 30 bits at each
+ * iteration.
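+ *
+ * As a small sanity check of the output convention: with x = 3 and
+ * y = 5 (both odd), the GCD is 1 and the bounds above force u = 2
+ * and v = 1, since 3*2 - 5*1 = 1.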
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * Instead of shifting values right, we double
+ * a_lo or b_lo at each iteration, so a division
+ * by 2 is implemented by simply not doubling
+ * the value.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
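+ /*
+ * Limbs hold 31-bit values, so bit 30 of the top limb is the sign
+ * bit. -(y[ylen - 1] >> 30) is all-ones when that bit is set and 0
+ * otherwise; the unsigned shift by 1 then yields the padding word
+ * 0x7FFFFFFF (y negative) or 0 (y nonnegative), used when reading
+ * y beyond its ylen words.
+ */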
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
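+ /*
+ * Sign-extend the 31-bit value to 32 bits by copying bit 30 into
+ * bit 31; e.g. 0x7FFFFFFF (minus one over 31 bits) becomes
+ * 0xFFFFFFFF, i.e. -1 as an int32_t, while 0x3FFFFFFF is unchanged.
+ */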
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to a higher word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
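+ /*
+ * Multiplication is modulo X^N+1, so X^N = -1: once the target
+ * index wraps past N-1, contributions land back at the start of
+ * F with a flipped sign (e.g. for N = 4, X^3 * X^2 = X^5 = -X),
+ * hence the negation of kf below.
+ */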
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ /*
+ * On little-endian systems we just interpret the bytes "as is"
+ * (this is correct because the exact-width types such as
+ * 'uint64_t' are guaranteed to have no padding and no trap
+ * representation).
+ */
+ uint64_t r;
+
+ inner_shake256_extract(rng, (uint8_t *)&r, sizeof r);
+ return r;
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
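+
+/*
+ * As a rough numerical check: sigma = 1.17*sqrt(12289/2048) is about
+ * 2.87, hence P(x = 0) is about 0.139, and indeed
+ * 1283868770400643928 / 2^63 is approximately 0.139. The last table
+ * entry is 0, so sampled magnitudes are bounded by the number of
+ * table entries.
+ */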
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
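+
+/*
+ * As a rough consistency check against the measured averages above:
+ * at depth 10, fg is about 6308 bits on average, i.e. roughly 204
+ * 31-bit words, against MAX_BL_SMALL[10] = 209; similarly, FG at
+ * depth 9 is about 9403 bits (roughly 304 words) against
+ * MAX_BL_LARGE[9] = 308.
+ */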
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
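+ /*
+ * ng is the OR of all partial sums, so its top bit is set if any
+ * partial sum reached 2^31; -(ng >> 31) is then all-ones and the
+ * OR below saturates the result to 2^32-1.
+ */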
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure to solve the NTRU equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, reduction
+ * coefficient will remain low. If it goes out-of-range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -(2^31-1)..+(2^31-1).
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the n first elements
+ * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in fk->tmp, one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
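+ /*
+ * modp_montymul(a, b) returns a*b/R mod p, so each product below
+ * carries a factor 1/R; r = 12289/R mod p carries the same factor,
+ * which keeps the comparison consistent. The loop thus checks that
+ * f*G - g*F = q = 12289 at every NTT point.
+ */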
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
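+ /*
+ * If either squared norm has its top bit set (overflow
+ * indicator), the expression below forces the combined value
+ * to 2^32-1, so the bound check rejects this (f,g) pair.
+ */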
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_AVX2_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/pqclean.c
new file mode 100644
index 000000000..06560ed5c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 1280 bytes
+ *
+ * message + signature:
+ * signature 1280 bytes
+ * message
+ */
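+
+/*
+ * For Falcon-padded-1024 (nnnn = 10) the signature length is thus fixed
+ * at 1280 bytes: 1 header byte, 40 nonce bytes, and 1239 bytes holding
+ * the compressed signature value followed by zero padding.
+ */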
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_10];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024];
+ uint16_t h[1024];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED1024_AVX2_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 10;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ f, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ g, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ F, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 10;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 10);
+ if (v != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024], G[1024];
+ struct {
+ int16_t sig[1024];
+ uint16_t hm[1024];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 10) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ f, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ g, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ F, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_AVX2_complete_private(G, f, g, F, 10, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash message nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(&sc, r.hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCONPADDED1024_AVX2_comp_encode(sigbuf, sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
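+ /*
+ * Zero-pad the rest of the fixed-size signature field,
+ * as required by the padded format.
+ */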
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[1024], hm[1024];
+ int16_t sig[1024];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 10) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED1024_AVX2_modq_decode(h, 10,
+ pk + 1, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_to_ntt_monty(h, 10);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
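+
+ /*
+ * In the padded format the signature field has a fixed length;
+ * any bytes after the encoded value must be zero, otherwise the
+ * signature is rejected.
+ */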
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(&sc, hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED1024_AVX2_verify_raw(hm, sig, h, 10, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 10;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 10) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 10;
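+ /*
+ * The extra increment accounts for the header byte, so that
+ * *smlen = mlen + 1 + NONCELEN + (CRYPTO_BYTES - NONCELEN - 1)
+ * = mlen + CRYPTO_BYTES.
+ */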
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 10) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/rng.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/rng.c
new file mode 100644
index 000000000..001aecb4e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/rng.c
@@ -0,0 +1,179 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <string.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_prng_init(prng *p, inner_shake256_context *src) {
+ inner_shake256_extract(src, p->state.d, 56);
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+ int i;
+ uint32_t *sw;
+ union {
+ uint32_t w[16];
+ __m256i y[2]; /* for alignment */
+ } t;
+ __m256i state[16], init[16];
+
+ sw = (uint32_t *)p->state.d;
+
+ /*
+ * XOR next counter values into state.
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ t.w[u] = (uint32_t)(cc + u);
+ t.w[u + 8] = (uint32_t)((cc + u) >> 32);
+ }
+ *(uint64_t *)(p->state.d + 48) = cc + 8;
+
+ /*
+ * Load state.
+ */
+ for (u = 0; u < 4; u ++) {
+ state[u] = init[u] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)CW[u]));
+ }
+ for (u = 0; u < 10; u ++) {
+ state[u + 4] = init[u + 4] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[u]));
+ }
+ state[14] = init[14] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[10])),
+ _mm256_loadu_si256((__m256i *)&t.w[0]));
+ state[15] = init[15] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[11])),
+ _mm256_loadu_si256((__m256i *)&t.w[8]));
+
+ /*
+ * Do all rounds.
+ */
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 16), \
+ _mm256_srli_epi32(state[d], 16)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 12), \
+ _mm256_srli_epi32(state[b], 20)); \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 8), \
+ _mm256_srli_epi32(state[d], 24)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 7), \
+ _mm256_srli_epi32(state[b], 25)); \
+ } while (0)
+
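+ /*
+ * Four column rounds followed by four diagonal rounds; ten such
+ * double rounds give the 20 rounds of ChaCha20.
+ */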
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ /*
+ * Add initial state back and encode the result in the destination
+ * buffer. We can dump the AVX2 values "as is" because the non-AVX2
+ * code uses a compatible order of values.
+ */
+ for (u = 0; u < 16; u ++) {
+ _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
+ _mm256_add_epi32(state[u], init[u]));
+ }
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/sign.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/sign.c
new file mode 100644
index 000000000..6761dbd60
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/sign.c
@@ -0,0 +1,1319 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
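+ /*
+ * The closed form s(logn) = (logn + 1) * 2^logn satisfies both
+ * relations, since (logn+1)*2^logn = 2^logn + 2*(logn * 2^(logn-1)).
+ */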
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr w0, w1, w2, w3, sigma;
+ __m128d ww0, ww1, wa, wb, wc, wd;
+ __m128d wy0, wy1, wz0, wz1;
+ __m128d half, invsqrt8, invsqrt2, neghi, neglo;
+ int si0, si1, si2, si3;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ half = _mm_set1_pd(0.5);
+ invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052);
+ invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105);
+ neghi = _mm_set_pd(-0.0, 0.0);
+ neglo = _mm_set_pd(0.0, -0.0);
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ ww0 = _mm_loadu_pd(&t1[0].v);
+ ww1 = _mm_loadu_pd(&t1[2].v);
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree1[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree1[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree1[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z1[0].v, ww0);
+ _mm_storeu_pd(&z1[2].v, ww1);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0);
+ wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1);
+ wz0 = _mm_loadu_pd(&tree[0].v);
+ wz1 = _mm_loadu_pd(&tree[2].v);
+ ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1));
+ ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0));
+ ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v));
+ ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v));
+
+ /*
+ * Second recursive invocation.
+ */
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree0[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree0[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree0[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z0[0].v, ww0);
+ _mm_storeu_pd(&z0[2].v, ww1);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
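+ /*
+ * If any partial sum overflowed (detected through the accumulated
+ * OR in ng), saturate sqn so that the shortness test below fails.
+ */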
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_gaussian0_sampler(prng *p) {
+
+ /*
+ * High words.
+ */
+ static const union {
+ uint16_t u16[16];
+ __m256i ymm[1];
+ } rhi15 = {
+ {
+ 0x51FB, 0x2A69, 0x113E, 0x0568,
+ 0x014A, 0x003B, 0x0008, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000
+ }
+ };
+
+ static const union {
+ uint64_t u64[20];
+ __m256i ymm[5];
+ } rlo57 = {
+ {
+ 0x1F42ED3AC391802, 0x12B181F3F7DDB82,
+ 0x1CDD0934829C1FF, 0x1754377C7994AE4,
+ 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F,
+ 0x024DD542B776AE4, 0x1A1FFDC65AD63DA,
+ 0x01F80D88A7B6428, 0x001C3FDB2040C69,
+ 0x00012CF24D031FB, 0x00000949F8B091F,
+ 0x0000003665DA998, 0x00000000EBF6EBB,
+ 0x0000000002F5D7E, 0x000000000007098,
+ 0x0000000000000C6, 0x000000000000001,
+ 0x000000000000000, 0x000000000000000
+ }
+ };
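+
+ /*
+ * rhi15 and rlo57 hold, respectively, the high 15 bits and the low
+ * 57 bits of the 72-bit table values; the sampler output is the
+ * number of those values that exceed a fresh 72-bit random draw,
+ * which follows the target half-Gaussian distribution.
+ */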
+
+ uint64_t lo;
+ unsigned hi;
+ __m256i xhi, rhi, gthi, eqhi, eqm;
+ __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4;
+ __m128i t, zt;
+ int r;
+
+ /*
+ * Get a 72-bit random value and split it into a low part
+ * (57 bits) and a high part (15 bits)
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ hi = (hi << 7) | (unsigned)(lo >> 57);
+ lo &= 0x1FFFFFFFFFFFFFF;
+
+ /*
+ * Broadcast the high part and compare it with the relevant
+ * values. We need both a "greater than" and an "equal"
+ * comparisons.
+ */
+ xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128((int)hi));
+ rhi = _mm256_loadu_si256(&rhi15.ymm[0]);
+ gthi = _mm256_cmpgt_epi16(rhi, xhi);
+ eqhi = _mm256_cmpeq_epi16(rhi, xhi);
+
+ /*
+ * The result is the number of 72-bit values (among the list of 19)
+ * which are greater than the 72-bit random value. We first count
+ * all non-zero 16-bit elements in the first eight of gthi. Such
+ * elements have value -1 or 0, so we first negate them.
+ */
+ t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15);
+ zt = _mm_setzero_si128();
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ r = _mm_cvtsi128_si32(t);
+
+ /*
+ * We must look at the low bits for all values for which the
+ * high bits are an "equal" match; values 8-18 all have the
+ * same high bits (0).
+ * On 32-bit systems, 'lo' really is two registers, requiring
+ * some extra code.
+ */
+ #if defined(__x86_64__) || defined(_M_X64)
+ xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
+ #else
+ {
+ uint32_t e0, e1;
+ int32_t f0, f1;
+
+ e0 = (uint32_t)lo;
+ e1 = (uint32_t)(lo >> 32);
+ f0 = *(int32_t *)&e0;
+ f1 = *(int32_t *)&e1;
+ xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0);
+ }
+ #endif
+ gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
+ gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
+ gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
+ gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo);
+ gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo);
+
+ /*
+ * Keep only comparison results that correspond to the non-zero
+ * elements in eqhi.
+ */
+ gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(eqhi)));
+ gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8))));
+ eqm = _mm256_permute4x64_epi64(eqhi, 0xFF);
+ gtlo2 = _mm256_and_si256(gtlo2, eqm);
+ gtlo3 = _mm256_and_si256(gtlo3, eqm);
+ gtlo4 = _mm256_and_si256(gtlo4, eqm);
+
+ /*
+ * Add all values to count the total number of "-1" elements.
+ * Since the first eight "high" words are all different, only
+ * one element (at most) in gtlo0:gtlo1 can be non-zero; however,
+ * if the high word of the random value is zero, then many
+ * elements of gtlo2:gtlo3:gtlo4 can be non-zero.
+ */
+ gtlo0 = _mm256_or_si256(gtlo0, gtlo1);
+ gtlo0 = _mm256_add_epi64(
+ _mm256_add_epi64(gtlo0, gtlo2),
+ _mm256_add_epi64(gtlo3, gtlo4));
+ t = _mm_add_epi64(
+ _mm256_castsi256_si128(gtlo0),
+ _mm256_extracti128_si256(gtlo0, 1));
+ t = _mm_add_epi64(t, _mm_srli_si128(t, 8));
+ r -= _mm_cvtsi128_si32(t);
+
+ return r;
+
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
+     * PRNG output to limit its consumption; the sign of the difference
+ * yields the expected result.
+ */
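+    /*
+     * Worked example: the PRNG bytes are compared against the bytes of
+     * z from most significant to least significant. If, say, the top
+     * byte of z is 0xB5 and the first PRNG byte is 0x4C, then w is
+     * negative, the loop stops at once and the function returns 1; on
+     * average a little more than one PRNG byte is consumed per call.
+     */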
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
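+        /*
+         * For instance, z0 = 3 gives z = 1 + 3 = 4 when b = 1, and
+         * z = -3 when b = 0, so the two half-Gaussians together cover
+         * all integers exactly once.
+         */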
+ z0 = PQCLEAN_FALCONPADDED1024_AVX2_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+         * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/vrfy.c
new file mode 100644
index 000000000..534d5d8c0
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
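+/*
+ * For example, x = -5 becomes y = 0xFFFFFFFB; its top bit is 1, so q is
+ * added and the result is 12284, which is -5 mod q.
+ */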
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
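+/*
+ * For example, mq_rshift1(1) = (1 + q) / 2 = 6145, and indeed
+ * 2 * 6145 = 12290 = 1 mod q.
+ */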
+static inline uint32_t
+mq_rshift1(uint32_t x) {
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
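+/*
+ * For example, mq_montymul(a, R2) = a * 2^32 / 2^16 = a * R mod q, which
+ * is how values are converted into Montgomery representation (see
+ * mq_poly_tomonty() below); conversely, mq_montymul(a, 1) = a / R mod q
+ * takes a value back out of Montgomery representation.
+ */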
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x, y and z are lower than q, the sum will
+ * be no more than (2^15 - 1) * q + (q - 1)^2, which will
+ * fit on 29 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We do a subtraction then conditional subtraction to
+ * ensure the result is in the expected range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
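+    /*
+     * Concretely, y18 = R*(y^(q-2)) mod q, so the Montgomery product
+     * with x cancels the R factor and yields x*(y^(q-2)) = x/y mod q.
+     */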
+ return mq_montymul(y18, x);
+}
+
+/*
+ * Compute NTT on a ring element.
+ */
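+/*
+ * The loop below is an iterative number-theoretic transform (a
+ * Cooley-Tukey FFT over Z_q): at each depth, every butterfly maps the
+ * pair (u, v) to (u + s*v, u - s*v) mod q, where s is a root of unity
+ * taken from GMb[] (stored in Montgomery representation, so that
+ * mq_montymul(v, s) directly yields the plain product v*s mod q).
+ */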
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+     * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
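+    /*
+     * On exit from the halving loop, ni = R/n mod q, so the final
+     * Montgomery multiplication computes a[m] * (R/n) / R = a[m]/n mod q.
+     */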
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
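+    /*
+     * A valid Falcon signature satisfies s1 + s2*h = c0 mod phi mod q,
+     * where c0 is the hashed message; since only the norm of (s1, s2)
+     * matters, we recompute the negation -s1 directly.
+     */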
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED1024_AVX2_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+     * keep that information in a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED1024_AVX2_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/LICENSE b/src/sig/falcon/pqclean_falcon-padded-1024_clean/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/api.h b/src/sig/falcon/pqclean_falcon-padded-1024_clean/api.h
new file mode 100644
index 000000000..0d38a55f7
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED1024_CLEAN_API_H
+#define PQCLEAN_FALCONPADDED1024_CLEAN_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES 2305
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES 1793
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES 1280
+
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_ALGNAME "Falcon-padded-1024"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
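+/*
+ * Minimal usage sketch (illustrative only; the message content is a
+ * placeholder and failures are simply aborted, which would need
+ * <stdlib.h>):
+ *
+ *    uint8_t pk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES];
+ *    uint8_t sk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES];
+ *    uint8_t sig[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES];
+ *    uint8_t msg[3] = { 1, 2, 3 };
+ *    size_t siglen;
+ *
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(pk, sk) != 0) {
+ *        abort();
+ *    }
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(sig, &siglen,
+ *            msg, sizeof msg, sk) != 0) {
+ *        abort();
+ *    }
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(sig, siglen,
+ *            msg, sizeof msg, pk) != 0) {
+ *        abort();
+ *    }
+ */
+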
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/codec.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/codec.c
new file mode 100644
index 000000000..9556fe73a
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+         * will fit in the accumulator, which is a uint32_t.
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
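+        /*
+         * For example, the coefficient -300 is encoded as the sign bit 1,
+         * the seven low bits 0101100 (44), then 001 (two zeros for the
+         * high part 300 >> 7 = 2, followed by the terminating one), for
+         * a total of 11 bits.
+         */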
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
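+/*
+ * For instance, at the Falcon-padded-1024 degree (logn = 10), f and g
+ * coefficients are encoded with 5 bits each (range -15..+15), while F
+ * and G coefficients use 8 bits each (range -127..+127).
+ */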
+
+const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/common.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/common.c
new file mode 100644
index 000000000..87c6771c2
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit value. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce modulo q the values; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
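+    /*
+     * Small example: with four samples of which the first and last were
+     * rejected ("- V V -"), the pass with p = 1 moves each valid value
+     * down by one slot, yielding "V V - -".
+     */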
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
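The saturation in is_short() works because each added square is at most 2^30, so the running 32-bit sum cannot skip over the [2^31, 2^32) window without setting bit 31 at least once; ng records that event, and the final OR forces s to 2^32-1, which exceeds every l2bound entry. A standalone sketch (my own harness, not part of the patch) compares the 32-bit accumulation against a 64-bit reference on pseudorandom vectors:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* 32-bit saturating accumulation, as in is_short() above. */
static uint32_t sat_norm(const int16_t *v, size_t n) {
    uint32_t s = 0, ng = 0;
    for (size_t u = 0; u < n; u ++) {
        int32_t z = v[u];
        s += (uint32_t)(z * z);
        ng |= s;
    }
    s |= -(ng >> 31);
    return s;
}

int main(void) {
    int16_t v[1024];
    for (int iter = 0; iter < 1000; iter ++) {
        uint64_t ref = 0;
        for (size_t u = 0; u < 1024; u ++) {
            v[u] = (int16_t)((rand() & 0xFFFF) - 0x8000);
            ref += (uint64_t)((int64_t)v[u] * v[u]);
        }
        uint32_t expected = (ref > 0x7FFFFFFFu) ? 0xFFFFFFFFu : (uint32_t)ref;
        if (sat_norm(v, 1024) != expected) {
            printf("mismatch\n");
            return 1;
        }
    }
    printf("saturating norm OK\n");
    return 0;
}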
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/fft.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fft.c
new file mode 100644
index 000000000..f0d5bd842
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fft.c
@@ -0,0 +1,699 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
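For reference, FPC_MUL and FPC_DIV implement the usual complex product and quotient (the divisor is inverted via its squared modulus). A small standalone sketch, using plain double rather than the fpr type, cross-checks the formulas against C99 <complex.h>:

#include <complex.h>
#include <stdio.h>

int main(void) {
    double a_re = 1.25, a_im = -0.5, b_re = 0.75, b_im = 2.0;

    /* Same formulas as FPC_MUL / FPC_DIV, in plain double. */
    double m_re = a_re * b_re - a_im * b_im;
    double m_im = a_re * b_im + a_im * b_re;
    double inv = 1.0 / (b_re * b_re + b_im * b_im);
    double d_re = (a_re * b_re + a_im * b_im) * inv;
    double d_im = (a_im * b_re - a_re * b_im) * inv;

    /* Reference using the C99 complex type. */
    double complex a = a_re + a_im * I, b = b_re + b_im * I;
    double complex m = a * b, d = a / b;

    printf("mul: %g %g vs %g %g\n", m_re, m_im, creal(m), cimag(m));
    printf("div: %g %g vs %g %g\n", d_re, d_im, creal(d), cimag(d));
    return 0;
}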
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
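To make the storage convention concrete, the following standalone sketch (illustrative only, not part of the patch) prints, for N = 8, the slots that receive Re(f(w_j)) and Im(f(w_j)), and incidentally shows that rev(j) is even for j < N/2:

#include <stdio.h>

/* Bit-reversal of j over 'logn' bits. */
static unsigned rev(unsigned j, unsigned logn) {
    unsigned r = 0;
    for (unsigned k = 0; k < logn; k ++) {
        r = (r << 1) | ((j >> k) & 1);
    }
    return r;
}

int main(void) {
    unsigned logn = 3, N = 1u << logn;
    for (unsigned j = 0; j < N / 2; j ++) {
        unsigned rj = rev(j, logn);
        /* rj is even for j < N/2, so rj/2 is an exact slot index. */
        printf("f(w_%u): Re -> slot %u, Im -> slot %u\n",
               j, rj / 2, rj / 2 + N / 2);
    }
    return 0;
}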
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+ * simple doubling of all the values in the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
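A minimal round-trip harness, as a sketch: it assumes it is compiled together with fft.c and fpr.c from this directory, and that inner.h exposes the fpr_of and fpr_rint helpers used elsewhere in the code. Applying FFT then iFFT to small integer coefficients should give back the original values after rounding:

#include <stdio.h>
#include "inner.h"

int main(void) {
    unsigned logn = 4;            /* degree n = 16 for a quick check */
    size_t n = (size_t)1 << logn;
    fpr f[16];
    int64_t ref[16];

    for (size_t u = 0; u < n; u ++) {
        ref[u] = (int64_t)(u * 3) - 20;   /* arbitrary small integers */
        f[u] = fpr_of(ref[u]);
    }
    PQCLEAN_FALCONPADDED1024_CLEAN_FFT(f, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(f, logn);
    for (size_t u = 0; u < n; u ++) {
        if (fpr_rint(f[u]) != ref[u]) {
            printf("mismatch at %zu\n", u);
            return 1;
        }
    }
    printf("FFT/iFFT round trip OK\n");
    return 0;
}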
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * bit-reversal function over the ring degree). This changes
+ * indexes with regard to the Falcon specification.
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * We process complex values by pairs. For logn = 1, there is only
+ * one complex value (the other one is the implicit conjugate),
+ * so we add the two lines below because the loop will be
+ * skipped.
+ */
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * An extra copy to handle the special case logn = 1.
+ */
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+}
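Under the same assumptions as the previous harness (compiled with this directory's fft.c and fpr.c, using the fpr_of and fpr_rint helpers), poly_split_fft followed by poly_merge_fft should reproduce the input up to floating-point rounding; a brief editorial sketch:

#include <stdio.h>
#include "inner.h"

int main(void) {
    unsigned logn = 4;
    size_t n = (size_t)1 << logn;
    fpr f[16], f0[8], f1[8], g[16];
    int64_t ref[16];

    for (size_t u = 0; u < n; u ++) {
        ref[u] = (int64_t)u - 7;
        f[u] = fpr_of(ref[u]);
    }
    PQCLEAN_FALCONPADDED1024_CLEAN_FFT(f, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(f0, f1, f, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(g, f0, f1, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(g, logn);
    for (size_t u = 0; u < n; u ++) {
        if (fpr_rint(g[u]) != ref[u]) {
            printf("mismatch at %zu\n", u);
            return 1;
        }
    }
    printf("split/merge round trip OK\n");
    return 0;
}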
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.c
new file mode 100644
index 000000000..82ff1df46
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.c
@@ -0,0 +1,1622 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Normalize a provided unsigned integer to the 2^63..2^64-1 range by
+ * left-shifting it if necessary. The exponent e is adjusted accordingly
+ * (i.e. if the value was left-shifted by n bits, then n is subtracted
+ * from e). If source m is 0, then it remains 0, but e is altered.
+ * Both m and e must be simple variables (no expressions allowed).
+ */
+#define FPR_NORM64(m, e) do { \
+ uint32_t nt; \
+ \
+ (e) -= 63; \
+ \
+ nt = (uint32_t)((m) >> 32); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 5); \
+ \
+ nt = (uint32_t)((m) >> 48); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 4); \
+ \
+ nt = (uint32_t)((m) >> 56); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 3); \
+ \
+ nt = (uint32_t)((m) >> 60); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 2); \
+ \
+ nt = (uint32_t)((m) >> 62); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 1); \
+ \
+ nt = (uint32_t)((m) >> 63); \
+ (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt); \
+ } while (0)
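FPR_NORM64 normalizes in a fixed number of steps by testing the top 32, 16, 8, 4, 2 and then 1 bits and conditionally shifting. The sketch below re-expresses the same idea as a loop (for illustration only; it is not the macro itself) and checks it against a naive shift loop:

#include <stdint.h>
#include <stdio.h>

/* Branchless normalization to the 2^63..2^64-1 range, mirroring FPR_NORM64
 * (the m = 0 case, which the macro also tolerates, is skipped here because
 * the naive reference below would not terminate on it). */
static void norm64(uint64_t *m, int *e) {
    *e -= 63;
    for (int sh = 32; sh >= 1; sh >>= 1) {
        uint32_t nt = (uint32_t)(*m >> (64 - sh));      /* top 'sh' bits */
        nt = (nt | -nt) >> 31;                          /* 1 if nonzero, else 0 */
        *m ^= (*m ^ (*m << sh)) & ((uint64_t)nt - 1);   /* shift only if zero */
        *e += (int)(nt * (uint32_t)sh);
    }
}

int main(void) {
    uint64_t samples[] = { 1, 0x1234, (uint64_t)1 << 40, 0x8000000000000000u };
    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i ++) {
        uint64_t m = samples[i], r = samples[i];
        int e = 0, er = 0;
        norm64(&m, &e);
        while (!(r >> 63)) { r <<= 1; er --; }   /* naive reference */
        if (m != r || e != er) {
            printf("mismatch for sample %zu\n", i);
            return 1;
        }
    }
    printf("normalization OK\n");
    return 0;
}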
+
+fpr
+fpr_scaled(int64_t i, int sc) {
+ /*
+ * To convert from int to float, we have to do the following:
+ * 1. Get the absolute value of the input, and its sign
+ * 2. Shift right or left the value as appropriate
+ * 3. Pack the result
+ *
+ * We can assume that the source integer is not -2^63.
+ */
+ int s, e;
+ uint32_t t;
+ uint64_t m;
+
+ /*
+ * Extract sign bit.
+ * We have: -i = 1 + ~i
+ */
+ s = (int)((uint64_t)i >> 63);
+ i ^= -(int64_t)s;
+ i += s;
+
+ /*
+ * For now we suppose that i != 0.
+ * We set m to i and left-shift it as much as needed
+ * to get a 1 in the top bit. We can do that in a logarithmic
+ * number of conditional shifts.
+ */
+ m = (uint64_t)i;
+ e = 9 + sc;
+ FPR_NORM64(m, e);
+
+ /*
+ * Now m is in the 2^63..2^64-1 range. We must divide it by 512;
+ * if one of the dropped bits is a 1, this should go into the
+ * "sticky bit".
+ */
+ m |= ((uint32_t)m & 0x1FF) + 0x1FF;
+ m >>= 9;
+
+ /*
+ * Corrective action: if i = 0 then all of the above was
+ * incorrect, and we clamp e and m down to zero.
+ */
+ t = (uint32_t)((uint64_t)(i | -i) >> 63);
+ m &= -(uint64_t)t;
+ e &= -(int)t;
+
+ /*
+ * Assemble back everything. The FPR() function will handle cases
+ * where e is too low.
+ */
+ return FPR(s, e, m);
+}
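The two-line "sticky" shift used in fpr_scaled above (and again in fpr_add and fpr_mul) computes m >> 9 while forcing the low bit of the result to 1 whenever any of the nine dropped bits was set. A standalone editorial check of that identity:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Check the sticky right-shift identity on a range of values. */
    for (uint64_t v = 0; v < 200000; v ++) {
        uint64_t m = v * 0x9E3779B97F4A7C15u;   /* spread bits around */

        uint64_t sticky = m;
        sticky |= ((uint32_t)sticky & 0x1FF) + 0x1FF;
        sticky >>= 9;

        uint64_t expected = (m >> 9) | ((m & 0x1FF) != 0);
        if (sticky != expected) {
            printf("mismatch at m=%llu\n", (unsigned long long)m);
            return 1;
        }
    }
    printf("sticky shift OK\n");
    return 0;
}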
+
+fpr
+fpr_add(fpr x, fpr y) {
+ uint64_t m, xu, yu, za;
+ uint32_t cs;
+ int ex, ey, sx, sy, cc;
+
+ /*
+ * Make sure that the first operand (x) has the larger absolute
+ * value. This guarantees that the exponent of y is less than
+ * or equal to the exponent of x, and, if they are equal, then
+ * the mantissa of y will not be greater than the mantissa of x.
+ *
+ * After this swap, the result will have the sign of x, except in
+ * the following edge case: abs(x) = abs(y), and x and y have
+ * opposite sign bits; in that case, the result shall be +0
+ * even if the sign bit of x is 1. To handle this case properly,
+ * we do the swap if abs(x) = abs(y) AND the sign of x is 1.
+ */
+ m = ((uint64_t)1 << 63) - 1;
+ za = (x & m) - (y & m);
+ cs = (uint32_t)(za >> 63)
+ | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63));
+ m = (x ^ y) & -(uint64_t)cs;
+ x ^= m;
+ y ^= m;
+
+ /*
+ * Extract sign bits, exponents and mantissas. The mantissas are
+ * scaled up to 2^55..2^56-1, and the exponent is unbiased. If
+ * an operand is zero, its mantissa is set to 0 at this step, and
+ * its exponent will be -1078.
+ */
+ ex = (int)(x >> 52);
+ sx = ex >> 11;
+ ex &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52;
+ xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ex -= 1078;
+ ey = (int)(y >> 52);
+ sy = ey >> 11;
+ ey &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52;
+ yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ey -= 1078;
+
+ /*
+ * x has the larger exponent; hence, we only need to right-shift y.
+ * If the shift count is larger than 59 bits then we clamp the
+ * value to zero.
+ */
+ cc = ex - ey;
+ yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31);
+ cc &= 63;
+
+ /*
+ * The lowest bit of yu is "sticky".
+ */
+ m = fpr_ulsh(1, cc) - 1;
+ yu |= (yu & m) + m;
+ yu = fpr_ursh(yu, cc);
+
+ /*
+ * If the operands have the same sign, then we add the mantissas;
+ * otherwise, we subtract the mantissas.
+ */
+ xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy));
+
+ /*
+ * The result may be smaller, or slightly larger. We normalize
+ * it to the 2^63..2^64-1 range (if xu is zero, then it stays
+ * at zero).
+ */
+ FPR_NORM64(xu, ex);
+
+ /*
+ * Scale down the value to 2^54..2^55-1, handling the last bit
+ * as sticky.
+ */
+ xu |= ((uint32_t)xu & 0x1FF) + 0x1FF;
+ xu >>= 9;
+ ex += 9;
+
+ /*
+ * In general, the result has the sign of x. However, if the
+ * result is exactly zero, then the following situations may
+ * be encountered:
+ * x > 0, y = -x -> result should be +0
+ * x < 0, y = -x -> result should be +0
+ * x = +0, y = +0 -> result should be +0
+ * x = -0, y = +0 -> result should be +0
+ * x = +0, y = -0 -> result should be +0
+ * x = -0, y = -0 -> result should be -0
+ *
+ * But at the conditional swap step at the start of the
+ * function, we ensured that if abs(x) = abs(y) and the
+ * sign of x was 1, then x and y were swapped. Thus, the
+ * two following cases cannot actually happen:
+ * x < 0, y = -x
+ * x = -0, y = +0
+ * In all other cases, the sign bit of x is conserved, which
+ * is what the FPR() function does. The FPR() function also
+ * properly clamps values to zero when the exponent is too
+ * low, but does not alter the sign in that case.
+ */
+ return FPR(sx, ex, xu);
+}
+
+fpr
+fpr_mul(fpr x, fpr y) {
+ uint64_t xu, yu, w, zu, zv;
+ uint32_t x0, x1, y0, y1, z0, z1, z2;
+ int ex, ey, d, e, s;
+
+ /*
+ * Extract absolute values as scaled unsigned integers. We
+ * don't extract exponents yet.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * We have two 53-bit integers to multiply; we need to split
+ * each into a lower half and an upper half. Moreover, we
+ * prefer the lower halves to be of 25 bits each, for
+ * reasons explained later on.
+ */
+ x0 = (uint32_t)xu & 0x01FFFFFF;
+ x1 = (uint32_t)(xu >> 25);
+ y0 = (uint32_t)yu & 0x01FFFFFF;
+ y1 = (uint32_t)(yu >> 25);
+ w = (uint64_t)x0 * (uint64_t)y0;
+ z0 = (uint32_t)w & 0x01FFFFFF;
+ z1 = (uint32_t)(w >> 25);
+ w = (uint64_t)x0 * (uint64_t)y1;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 = (uint32_t)(w >> 25);
+ w = (uint64_t)x1 * (uint64_t)y0;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 += (uint32_t)(w >> 25);
+ zu = (uint64_t)x1 * (uint64_t)y1;
+ z2 += (z1 >> 25);
+ z1 &= 0x01FFFFFF;
+ zu += z2;
+
+ /*
+ * Since xu and yu are both in the 2^52..2^53-1 range, the
+ * product is in the 2^104..2^106-1 range. We first reassemble
+ * it and round it into the 2^54..2^56-1 range; the bottom bit
+ * is made "sticky". Since the low limbs z0 and z1 are 25 bits
+ * each, we just take the upper part (zu), and consider z0 and
+ * z1 only for purposes of stickiness.
+ * (This is the reason why we chose 25-bit limbs above.)
+ */
+ zu |= ((z0 | z1) + 0x01FFFFFF) >> 25;
+
+ /*
+ * We normalize zu to the 2^54..2^55-1 range: it could be one
+ * bit too large at this point. This is done with a conditional
+ * right-shift that takes into account the sticky bit.
+ */
+ zv = (zu >> 1) | (zu & 1);
+ w = zu >> 55;
+ zu ^= (zu ^ zv) & -w;
+
+ /*
+ * Get the aggregate scaling factor:
+ *
+ * - Each exponent is biased by 1023.
+ *
+ * - Integral mantissas are scaled by 2^52, hence an
+ * extra 52 bias for each exponent.
+ *
+ * - However, we right-shifted z by 50 bits, and then
+ * by 0 or 1 extra bit (depending on the value of w).
+ *
+ * In total, we must add the exponents, then subtract
+ * 2 * (1023 + 52), then add 50 + w.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex + ey - 2100 + (int)w;
+
+ /*
+ * Sign bit is the XOR of the operand sign bits.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if either of the operands is
+ * zero, then the computations above were wrong. Test for zero
+ * is whether ex or ey is zero. We just have to set the mantissa
+ * (zu) to zero, the FPR() function will normalize e.
+ */
+ d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11;
+ zu &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, zu);
+}
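The 25-bit limb split keeps every intermediate sum of the 53x53-bit product within 64 bits while still producing the exact 106-bit result (as zu*2^50 + z1*2^25 + z0). The following sketch (an editorial cross-check assuming a compiler with the unsigned __int128 extension, e.g. GCC or Clang) reassembles those limbs and compares against a 128-bit reference product:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    for (int iter = 0; iter < 100000; iter ++) {
        /* Random 53-bit mantissas in the 2^52..2^53-1 range. */
        uint64_t xu = ((((uint64_t)rand() << 31) ^ (uint64_t)rand())
                      % ((uint64_t)1 << 52)) | ((uint64_t)1 << 52);
        uint64_t yu = ((((uint64_t)rand() << 31) ^ (uint64_t)rand())
                      % ((uint64_t)1 << 52)) | ((uint64_t)1 << 52);

        /* Same 25-bit limb schoolbook multiplication as in fpr_mul(). */
        uint32_t x0 = (uint32_t)xu & 0x01FFFFFF, x1 = (uint32_t)(xu >> 25);
        uint32_t y0 = (uint32_t)yu & 0x01FFFFFF, y1 = (uint32_t)(yu >> 25);
        uint64_t w, zu;
        uint32_t z0, z1, z2;
        w = (uint64_t)x0 * (uint64_t)y0;
        z0 = (uint32_t)w & 0x01FFFFFF;
        z1 = (uint32_t)(w >> 25);
        w = (uint64_t)x0 * (uint64_t)y1;
        z1 += (uint32_t)w & 0x01FFFFFF;
        z2 = (uint32_t)(w >> 25);
        w = (uint64_t)x1 * (uint64_t)y0;
        z1 += (uint32_t)w & 0x01FFFFFF;
        z2 += (uint32_t)(w >> 25);
        zu = (uint64_t)x1 * (uint64_t)y1;
        z2 += (z1 >> 25);
        z1 &= 0x01FFFFFF;
        zu += z2;

        /* Reassemble and compare against a 128-bit reference product. */
        unsigned __int128 got = ((unsigned __int128)zu << 50)
                              + ((unsigned __int128)z1 << 25) + z0;
        unsigned __int128 ref = (unsigned __int128)xu * yu;
        if (got != ref) {
            printf("mismatch\n");
            return 1;
        }
    }
    printf("limb product OK\n");
    return 0;
}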
+
+fpr
+fpr_div(fpr x, fpr y) {
+ uint64_t xu, yu, q, q2, w;
+ int i, ex, ey, e, d, s;
+
+ /*
+ * Extract mantissas of x and y (unsigned).
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * Perform bit-by-bit division of xu by yu. We run it for 55 bits.
+ */
+ q = 0;
+ for (i = 0; i < 55; i ++) {
+ /*
+ * If yu is less than or equal to xu, then subtract it and
+ * push a 1 in the quotient; otherwise, leave xu unchanged
+ * and push a 0.
+ */
+ uint64_t b;
+
+ b = ((xu - yu) >> 63) - 1;
+ xu -= b & yu;
+ q |= b & 1;
+ xu <<= 1;
+ q <<= 1;
+ }
+
+ /*
+ * We got 55 bits in the quotient, followed by an extra zero. We
+ * want that 56th bit to be "sticky": it should be a 1 if and
+ * only if the remainder (xu) is non-zero.
+ */
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Quotient is at most 2^56-1. Its top bit may be zero, but in
+ * that case the next-to-top bit will be a one, since the
+ * initial xu and yu were both in the 2^52..2^53-1 range.
+ * We perform a conditional shift to normalize q to the
+ * 2^54..2^55-1 range (with the bottom bit being sticky).
+ */
+ q2 = (q >> 1) | (q & 1);
+ w = q >> 55;
+ q ^= (q ^ q2) & -w;
+
+ /*
+ * Extract exponents to compute the scaling factor:
+ *
+ * - Each exponent is biased and we scaled them up by
+ * 52 bits; but these biases will cancel out.
+ *
+ * - The division loop produced a 55-bit shifted result,
+ * so we must scale it down by 55 bits.
+ *
+ * - If w = 1, we right-shifted the integer by 1 bit,
+ * hence we must add 1 to the scaling.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex - ey - 55 + (int)w;
+
+ /*
+ * Sign is the XOR of the signs of the operands.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if x = 0, then the computation
+ * is wrong, and we must clamp e and q to 0. We do not care
+ * about the case y = 0 (as per assumptions in this module,
+ * the caller does not perform divisions by zero).
+ */
+ d = (ex + 0x7FF) >> 11;
+ s &= d;
+ e &= -d;
+ q &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, q);
+}
+
+fpr
+fpr_sqrt(fpr x) {
+ uint64_t xu, q, s, r;
+ int ex, e;
+
+ /*
+ * Extract the mantissa and the exponent. We don't care about
+ * the sign: by assumption, the operand is nonnegative.
+ * We want the "true" exponent corresponding to a mantissa
+ * in the 1..2 range.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ ex = (int)((x >> 52) & 0x7FF);
+ e = ex - 1023;
+
+ /*
+ * If the exponent is odd, double the mantissa and decrement
+ * the exponent. The exponent is then halved to account for
+ * the square root.
+ */
+ xu += xu & -(uint64_t)(e & 1);
+ e >>= 1;
+
+ /*
+ * Double the mantissa.
+ */
+ xu <<= 1;
+
+ /*
+ * We now have a mantissa in the 2^53..2^55-1 range. It
+ * represents a value between 1 (inclusive) and 4 (exclusive)
+ * in fixed point notation (with 53 fractional bits). We
+ * compute the square root bit by bit.
+ */
+ q = 0;
+ s = 0;
+ r = (uint64_t)1 << 53;
+ for (int i = 0; i < 54; i ++) {
+ uint64_t t, b;
+
+ t = s + r;
+ b = ((xu - t) >> 63) - 1;
+ s += (r << 1) & b;
+ xu -= t & b;
+ q += r & b;
+ xu <<= 1;
+ r >>= 1;
+ }
+
+ /*
+ * Now, q is a rounded-low 54-bit value, with a leading 1,
+ * 52 fractional digits, and an additional guard bit. We add
+ * an extra sticky bit to account for what remains of the operand.
+ */
+ q <<= 1;
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Result q is in the 2^54..2^55-1 range; we bias the exponent
+ * by 54 bits (the value e at that point contains the "true"
+ * exponent, but q is now considered an integer, i.e. scaled
+ * up).
+ */
+ e -= 54;
+
+ /*
+ * Corrective action for an operand of value zero.
+ */
+ q &= -(uint64_t)((ex + 0x7FF) >> 11);
+
+ /*
+ * Apply rounding and return the result.
+ */
+ return FPR(0, e, q);
+}
+
+uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Here, the coefficients have been scaled up by 2^63 and
+ * converted to integers.
+ *
+ * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+ static const uint64_t C[] = {
+ 0x00000004741183A3u,
+ 0x00000036548CFC06u,
+ 0x0000024FDCBF140Au,
+ 0x0000171D939DE045u,
+ 0x0000D00CF58F6F84u,
+ 0x000680681CF796E3u,
+ 0x002D82D8305B0FEAu,
+ 0x011111110E066FD0u,
+ 0x0555555555070F00u,
+ 0x155555555581FF00u,
+ 0x400000000002B400u,
+ 0x7FFFFFFFFFFF4800u,
+ 0x8000000000000000u
+ };
+
+ uint64_t z, y;
+ unsigned u;
+ uint32_t z0, z1, y0, y1;
+ uint64_t a, b;
+
+ y = C[0];
+ z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
+ for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) {
+ /*
+ * Compute product z * y over 128 bits, but keep only
+ * the top 64 bits.
+ *
+ * TODO: On some architectures/compilers we could use
+ * some intrinsics (__umulh() on MSVC) or other compiler
+ * extensions (unsigned __int128 on GCC / Clang) for
+ * improved speed; however, most 64-bit architectures
+ * also have appropriate IEEE754 floating-point support,
+ * which is better.
+ */
+ uint64_t c;
+
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ c = (a >> 32) + (b >> 32);
+ c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ c += (uint64_t)z1 * (uint64_t)y1;
+ y = C[u] - c;
+ }
+
+ /*
+ * The scaling factor must be applied at the end. Since y is now
+ * in fixed-point notation, we have to convert the factor to the
+ * same format, and do an extra integer multiplication.
+ */
+ z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ y = (a >> 32) + (b >> 32);
+ y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ y += (uint64_t)z1 * (uint64_t)y1;
+
+ return y;
+}
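The inner loop above computes the top 64 bits of a 64x64-bit product from 32-bit limbs; as the code's own comment notes, an unsigned __int128 product would give the same result on compilers that support that extension. A standalone sketch (editorial, not part of the patch) checking the limb computation against such a reference:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Top 64 bits of a 64x64-bit product, using only 32-bit limbs,
 * exactly as in the loop of fpr_expm_p63() above. */
static uint64_t mul_high64(uint64_t z, uint64_t y) {
    uint32_t z0 = (uint32_t)z, z1 = (uint32_t)(z >> 32);
    uint32_t y0 = (uint32_t)y, y1 = (uint32_t)(y >> 32);
    uint64_t a = (uint64_t)z0 * (uint64_t)y1
               + (((uint64_t)z0 * (uint64_t)y0) >> 32);
    uint64_t b = (uint64_t)z1 * (uint64_t)y0;
    uint64_t c = (a >> 32) + (b >> 32);
    c += ((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32;
    c += (uint64_t)z1 * (uint64_t)y1;
    return c;
}

int main(void) {
    for (int i = 0; i < 1000000; i ++) {
        uint64_t z = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20)
                   ^ (uint64_t)rand();
        uint64_t y = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20)
                   ^ (uint64_t)rand();
        uint64_t ref = (uint64_t)(((unsigned __int128)z * y) >> 64);
        if (mul_high64(z, y) != ref) {
            printf("mismatch\n");
            return 1;
        }
    }
    printf("high-64 multiply OK\n");
    return 0;
}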
+
+const fpr fpr_gm_tab[] = {
+ 0, 0,
+ 9223372036854775808U, 4607182418800017408U,
+ 4604544271217802189U, 4604544271217802189U,
+ 13827916308072577997U, 4604544271217802189U,
+ 4606496786581982534U, 4600565431771507043U,
+ 13823937468626282851U, 4606496786581982534U,
+ 4600565431771507043U, 4606496786581982534U,
+ 13829868823436758342U, 4600565431771507043U,
+ 4607009347991985328U, 4596196889902818827U,
+ 13819568926757594635U, 4607009347991985328U,
+ 4603179351334086856U, 4605664432017547683U,
+ 13829036468872323491U, 4603179351334086856U,
+ 4605664432017547683U, 4603179351334086856U,
+ 13826551388188862664U, 4605664432017547683U,
+ 4596196889902818827U, 4607009347991985328U,
+ 13830381384846761136U, 4596196889902818827U,
+ 4607139046673687846U, 4591727299969791020U,
+ 13815099336824566828U, 4607139046673687846U,
+ 4603889326261607894U, 4605137878724712257U,
+ 13828509915579488065U, 4603889326261607894U,
+ 4606118860100255153U, 4602163548591158843U,
+ 13825535585445934651U, 4606118860100255153U,
+ 4598900923775164166U, 4606794571824115162U,
+ 13830166608678890970U, 4598900923775164166U,
+ 4606794571824115162U, 4598900923775164166U,
+ 13822272960629939974U, 4606794571824115162U,
+ 4602163548591158843U, 4606118860100255153U,
+ 13829490896955030961U, 4602163548591158843U,
+ 4605137878724712257U, 4603889326261607894U,
+ 13827261363116383702U, 4605137878724712257U,
+ 4591727299969791020U, 4607139046673687846U,
+ 13830511083528463654U, 4591727299969791020U,
+ 4607171569234046334U, 4587232218149935124U,
+ 13810604255004710932U, 4607171569234046334U,
+ 4604224084862889120U, 4604849113969373103U,
+ 13828221150824148911U, 4604224084862889120U,
+ 4606317631232591731U, 4601373767755717824U,
+ 13824745804610493632U, 4606317631232591731U,
+ 4599740487990714333U, 4606655894547498725U,
+ 13830027931402274533U, 4599740487990714333U,
+ 4606912484326125783U, 4597922303871901467U,
+ 13821294340726677275U, 4606912484326125783U,
+ 4602805845399633902U, 4605900952042040894U,
+ 13829272988896816702U, 4602805845399633902U,
+ 4605409869824231233U, 4603540801876750389U,
+ 13826912838731526197U, 4605409869824231233U,
+ 4594454542771183930U, 4607084929468638487U,
+ 13830456966323414295U, 4594454542771183930U,
+ 4607084929468638487U, 4594454542771183930U,
+ 13817826579625959738U, 4607084929468638487U,
+ 4603540801876750389U, 4605409869824231233U,
+ 13828781906679007041U, 4603540801876750389U,
+ 4605900952042040894U, 4602805845399633902U,
+ 13826177882254409710U, 4605900952042040894U,
+ 4597922303871901467U, 4606912484326125783U,
+ 13830284521180901591U, 4597922303871901467U,
+ 4606655894547498725U, 4599740487990714333U,
+ 13823112524845490141U, 4606655894547498725U,
+ 4601373767755717824U, 4606317631232591731U,
+ 13829689668087367539U, 4601373767755717824U,
+ 4604849113969373103U, 4604224084862889120U,
+ 13827596121717664928U, 4604849113969373103U,
+ 4587232218149935124U, 4607171569234046334U,
+ 13830543606088822142U, 4587232218149935124U,
+ 4607179706000002317U, 4582730748936808062U,
+ 13806102785791583870U, 4607179706000002317U,
+ 4604386048625945823U, 4604698657331085206U,
+ 13828070694185861014U, 4604386048625945823U,
+ 4606409688975526202U, 4600971798440897930U,
+ 13824343835295673738U, 4606409688975526202U,
+ 4600154912527631775U, 4606578871587619388U,
+ 13829950908442395196U, 4600154912527631775U,
+ 4606963563043808649U, 4597061974398750563U,
+ 13820434011253526371U, 4606963563043808649U,
+ 4602994049708411683U, 4605784983948558848U,
+ 13829157020803334656U, 4602994049708411683U,
+ 4605539368864982914U, 4603361638657888991U,
+ 13826733675512664799U, 4605539368864982914U,
+ 4595327571478659014U, 4607049811591515049U,
+ 13830421848446290857U, 4595327571478659014U,
+ 4607114680469659603U, 4593485039402578702U,
+ 13816857076257354510U, 4607114680469659603U,
+ 4603716733069447353U, 4605276012900672507U,
+ 13828648049755448315U, 4603716733069447353U,
+ 4606012266443150634U, 4602550884377336506U,
+ 13825922921232112314U, 4606012266443150634U,
+ 4598476289818621559U, 4606856142606846307U,
+ 13830228179461622115U, 4598476289818621559U,
+ 4606727809065869586U, 4599322407794599425U,
+ 13822694444649375233U, 4606727809065869586U,
+ 4601771097584682078U, 4606220668805321205U,
+ 13829592705660097013U, 4601771097584682078U,
+ 4604995550503212910U, 4604058477489546729U,
+ 13827430514344322537U, 4604995550503212910U,
+ 4589965306122607094U, 4607158013403433018U,
+ 13830530050258208826U, 4589965306122607094U,
+ 4607158013403433018U, 4589965306122607094U,
+ 13813337342977382902U, 4607158013403433018U,
+ 4604058477489546729U, 4604995550503212910U,
+ 13828367587357988718U, 4604058477489546729U,
+ 4606220668805321205U, 4601771097584682078U,
+ 13825143134439457886U, 4606220668805321205U,
+ 4599322407794599425U, 4606727809065869586U,
+ 13830099845920645394U, 4599322407794599425U,
+ 4606856142606846307U, 4598476289818621559U,
+ 13821848326673397367U, 4606856142606846307U,
+ 4602550884377336506U, 4606012266443150634U,
+ 13829384303297926442U, 4602550884377336506U,
+ 4605276012900672507U, 4603716733069447353U,
+ 13827088769924223161U, 4605276012900672507U,
+ 4593485039402578702U, 4607114680469659603U,
+ 13830486717324435411U, 4593485039402578702U,
+ 4607049811591515049U, 4595327571478659014U,
+ 13818699608333434822U, 4607049811591515049U,
+ 4603361638657888991U, 4605539368864982914U,
+ 13828911405719758722U, 4603361638657888991U,
+ 4605784983948558848U, 4602994049708411683U,
+ 13826366086563187491U, 4605784983948558848U,
+ 4597061974398750563U, 4606963563043808649U,
+ 13830335599898584457U, 4597061974398750563U,
+ 4606578871587619388U, 4600154912527631775U,
+ 13823526949382407583U, 4606578871587619388U,
+ 4600971798440897930U, 4606409688975526202U,
+ 13829781725830302010U, 4600971798440897930U,
+ 4604698657331085206U, 4604386048625945823U,
+ 13827758085480721631U, 4604698657331085206U,
+ 4582730748936808062U, 4607179706000002317U,
+ 13830551742854778125U, 4582730748936808062U,
+ 4607181740574479067U, 4578227681973159812U,
+ 13801599718827935620U, 4607181740574479067U,
+ 4604465633578481725U, 4604621949701367983U,
+ 13827993986556143791U, 4604465633578481725U,
+ 4606453861145241227U, 4600769149537129431U,
+ 13824141186391905239U, 4606453861145241227U,
+ 4600360675823176935U, 4606538458821337243U,
+ 13829910495676113051U, 4600360675823176935U,
+ 4606987119037722413U, 4596629994023683153U,
+ 13820002030878458961U, 4606987119037722413U,
+ 4603087070374583113U, 4605725276488455441U,
+ 13829097313343231249U, 4603087070374583113U,
+ 4605602459698789090U, 4603270878689749849U,
+ 13826642915544525657U, 4605602459698789090U,
+ 4595762727260045105U, 4607030246558998647U,
+ 13830402283413774455U, 4595762727260045105U,
+ 4607127537664763515U, 4592606767730311893U,
+ 13815978804585087701U, 4607127537664763515U,
+ 4603803453461190356U, 4605207475328619533U,
+ 13828579512183395341U, 4603803453461190356U,
+ 4606066157444814153U, 4602357870542944470U,
+ 13825729907397720278U, 4606066157444814153U,
+ 4598688984595225406U, 4606826008603986804U,
+ 13830198045458762612U, 4598688984595225406U,
+ 4606761837001494797U, 4599112075441176914U,
+ 13822484112295952722U, 4606761837001494797U,
+ 4601967947786150793U, 4606170366472647579U,
+ 13829542403327423387U, 4601967947786150793U,
+ 4605067233569943231U, 4603974338538572089U,
+ 13827346375393347897U, 4605067233569943231U,
+ 4590846768565625881U, 4607149205763218185U,
+ 13830521242617993993U, 4590846768565625881U,
+ 4607165468267934125U, 4588998070480937184U,
+ 13812370107335712992U, 4607165468267934125U,
+ 4604141730443515286U, 4604922840319727473U,
+ 13828294877174503281U, 4604141730443515286U,
+ 4606269759522929756U, 4601573027631668967U,
+ 13824945064486444775U, 4606269759522929756U,
+ 4599531889160152938U, 4606692493141721470U,
+ 13830064529996497278U, 4599531889160152938U,
+ 4606884969294623682U, 4598262871476403630U,
+ 13821634908331179438U, 4606884969294623682U,
+ 4602710690099904183U, 4605957195211051218U,
+ 13829329232065827026U, 4602710690099904183U,
+ 4605343481119364930U, 4603629178146150899U,
+ 13827001215000926707U, 4605343481119364930U,
+ 4594016801320007031U, 4607100477024622401U,
+ 13830472513879398209U, 4594016801320007031U,
+ 4607068040143112603U, 4594891488091520602U,
+ 13818263524946296410U, 4607068040143112603U,
+ 4603451617570386922U, 4605475169017376660U,
+ 13828847205872152468U, 4603451617570386922U,
+ 4605843545406134034U, 4602900303344142735U,
+ 13826272340198918543U, 4605843545406134034U,
+ 4597492765973365521U, 4606938683557690074U,
+ 13830310720412465882U, 4597492765973365521U,
+ 4606618018794815019U, 4599948172872067014U,
+ 13823320209726842822U, 4606618018794815019U,
+ 4601173347964633034U, 4606364276725003740U,
+ 13829736313579779548U, 4601173347964633034U,
+ 4604774382555066977U, 4604305528345395596U,
+ 13827677565200171404U, 4604774382555066977U,
+ 4585465300892538317U, 4607176315382986589U,
+ 13830548352237762397U, 4585465300892538317U,
+ 4607176315382986589U, 4585465300892538317U,
+ 13808837337747314125U, 4607176315382986589U,
+ 4604305528345395596U, 4604774382555066977U,
+ 13828146419409842785U, 4604305528345395596U,
+ 4606364276725003740U, 4601173347964633034U,
+ 13824545384819408842U, 4606364276725003740U,
+ 4599948172872067014U, 4606618018794815019U,
+ 13829990055649590827U, 4599948172872067014U,
+ 4606938683557690074U, 4597492765973365521U,
+ 13820864802828141329U, 4606938683557690074U,
+ 4602900303344142735U, 4605843545406134034U,
+ 13829215582260909842U, 4602900303344142735U,
+ 4605475169017376660U, 4603451617570386922U,
+ 13826823654425162730U, 4605475169017376660U,
+ 4594891488091520602U, 4607068040143112603U,
+ 13830440076997888411U, 4594891488091520602U,
+ 4607100477024622401U, 4594016801320007031U,
+ 13817388838174782839U, 4607100477024622401U,
+ 4603629178146150899U, 4605343481119364930U,
+ 13828715517974140738U, 4603629178146150899U,
+ 4605957195211051218U, 4602710690099904183U,
+ 13826082726954679991U, 4605957195211051218U,
+ 4598262871476403630U, 4606884969294623682U,
+ 13830257006149399490U, 4598262871476403630U,
+ 4606692493141721470U, 4599531889160152938U,
+ 13822903926014928746U, 4606692493141721470U,
+ 4601573027631668967U, 4606269759522929756U,
+ 13829641796377705564U, 4601573027631668967U,
+ 4604922840319727473U, 4604141730443515286U,
+ 13827513767298291094U, 4604922840319727473U,
+ 4588998070480937184U, 4607165468267934125U,
+ 13830537505122709933U, 4588998070480937184U,
+ 4607149205763218185U, 4590846768565625881U,
+ 13814218805420401689U, 4607149205763218185U,
+ 4603974338538572089U, 4605067233569943231U,
+ 13828439270424719039U, 4603974338538572089U,
+ 4606170366472647579U, 4601967947786150793U,
+ 13825339984640926601U, 4606170366472647579U,
+ 4599112075441176914U, 4606761837001494797U,
+ 13830133873856270605U, 4599112075441176914U,
+ 4606826008603986804U, 4598688984595225406U,
+ 13822061021450001214U, 4606826008603986804U,
+ 4602357870542944470U, 4606066157444814153U,
+ 13829438194299589961U, 4602357870542944470U,
+ 4605207475328619533U, 4603803453461190356U,
+ 13827175490315966164U, 4605207475328619533U,
+ 4592606767730311893U, 4607127537664763515U,
+ 13830499574519539323U, 4592606767730311893U,
+ 4607030246558998647U, 4595762727260045105U,
+ 13819134764114820913U, 4607030246558998647U,
+ 4603270878689749849U, 4605602459698789090U,
+ 13828974496553564898U, 4603270878689749849U,
+ 4605725276488455441U, 4603087070374583113U,
+ 13826459107229358921U, 4605725276488455441U,
+ 4596629994023683153U, 4606987119037722413U,
+ 13830359155892498221U, 4596629994023683153U,
+ 4606538458821337243U, 4600360675823176935U,
+ 13823732712677952743U, 4606538458821337243U,
+ 4600769149537129431U, 4606453861145241227U,
+ 13829825898000017035U, 4600769149537129431U,
+ 4604621949701367983U, 4604465633578481725U,
+ 13827837670433257533U, 4604621949701367983U,
+ 4578227681973159812U, 4607181740574479067U,
+ 13830553777429254875U, 4578227681973159812U,
+ 4607182249242036882U, 4573724215515480177U,
+ 13797096252370255985U, 4607182249242036882U,
+ 4604505071555817232U, 4604583231088591477U,
+ 13827955267943367285U, 4604505071555817232U,
+ 4606475480113671417U, 4600667422348321968U,
+ 13824039459203097776U, 4606475480113671417U,
+ 4600463181646572228U, 4606517779747998088U,
+ 13829889816602773896U, 4600463181646572228U,
+ 4606998399608725124U, 4596413578358834022U,
+ 13819785615213609830U, 4606998399608725124U,
+ 4603133304188877240U, 4605694995810664660U,
+ 13829067032665440468U, 4603133304188877240U,
+ 4605633586259814045U, 4603225210076562971U,
+ 13826597246931338779U, 4605633586259814045U,
+ 4595979936813835462U, 4607019963775302583U,
+ 13830392000630078391U, 4595979936813835462U,
+ 4607133460805585796U, 4592167175087283203U,
+ 13815539211942059011U, 4607133460805585796U,
+ 4603846496621587377U, 4605172808754305228U,
+ 13828544845609081036U, 4603846496621587377U,
+ 4606092657816072624U, 4602260871257280788U,
+ 13825632908112056596U, 4606092657816072624U,
+ 4598795050632330097U, 4606810452769876110U,
+ 13830182489624651918U, 4598795050632330097U,
+ 4606778366364612594U, 4599006600037663623U,
+ 13822378636892439431U, 4606778366364612594U,
+ 4602065906208722008U, 4606144763310860551U,
+ 13829516800165636359U, 4602065906208722008U,
+ 4605102686554936490U, 4603931940768740167U,
+ 13827303977623515975U, 4605102686554936490U,
+ 4591287158938884897U, 4607144295058764886U,
+ 13830516331913540694U, 4591287158938884897U,
+ 4607168688050493276U, 4588115294056142819U,
+ 13811487330910918627U, 4607168688050493276U,
+ 4604183020748362039U, 4604886103475043762U,
+ 13828258140329819570U, 4604183020748362039U,
+ 4606293848208650998U, 4601473544562720001U,
+ 13824845581417495809U, 4606293848208650998U,
+ 4599636300858866724U, 4606674353838411301U,
+ 13830046390693187109U, 4599636300858866724U,
+ 4606898891031025132U, 4598136582470364665U,
+ 13821508619325140473U, 4606898891031025132U,
+ 4602758354025980442U, 4605929219593405673U,
+ 13829301256448181481U, 4602758354025980442U,
+ 4605376811039722786U, 4603585091850767959U,
+ 13826957128705543767U, 4605376811039722786U,
+ 4594235767444503503U, 4607092871118901179U,
+ 13830464907973676987U, 4594235767444503503U,
+ 4607076652372832968U, 4594673119063280916U,
+ 13818045155918056724U, 4607076652372832968U,
+ 4603496309891590679U, 4605442656228245717U,
+ 13828814693083021525U, 4603496309891590679U,
+ 4605872393621214213U, 4602853162432841185U,
+ 13826225199287616993U, 4605872393621214213U,
+ 4597707695679609371U, 4606925748668145757U,
+ 13830297785522921565U, 4597707695679609371U,
+ 4606637115963965612U, 4599844446633109139U,
+ 13823216483487884947U, 4606637115963965612U,
+ 4601273700967202825U, 4606341107699334546U,
+ 13829713144554110354U, 4601273700967202825U,
+ 4604811873195349477U, 4604264921241055824U,
+ 13827636958095831632U, 4604811873195349477U,
+ 4586348876009622851U, 4607174111710118367U,
+ 13830546148564894175U, 4586348876009622851U,
+ 4607178180169683960U, 4584498631466405633U,
+ 13807870668321181441U, 4607178180169683960U,
+ 4604345904647073908U, 4604736643460027021U,
+ 13828108680314802829U, 4604345904647073908U,
+ 4606387137437298591U, 4601072712526242277U,
+ 13824444749381018085U, 4606387137437298591U,
+ 4600051662802353687U, 4606598603759044570U,
+ 13829970640613820378U, 4600051662802353687U,
+ 4606951288507767453U, 4597277522845151878U,
+ 13820649559699927686U, 4606951288507767453U,
+ 4602947266358709886U, 4605814408482919348U,
+ 13829186445337695156U, 4602947266358709886U,
+ 4605507406967535927U, 4603406726595779752U,
+ 13826778763450555560U, 4605507406967535927U,
+ 4595109641634432498U, 4607059093103722971U,
+ 13830431129958498779U, 4595109641634432498U,
+ 4607107746899444102U, 4593797652641645341U,
+ 13817169689496421149U, 4607107746899444102U,
+ 4603673059103075106U, 4605309881318010327U,
+ 13828681918172786135U, 4603673059103075106U,
+ 4605984877841711338U, 4602646891659203088U,
+ 13826018928513978896U, 4605984877841711338U,
+ 4598369669086960528U, 4606870719641066940U,
+ 13830242756495842748U, 4598369669086960528U,
+ 4606710311774494716U, 4599427256825614420U,
+ 13822799293680390228U, 4606710311774494716U,
+ 4601672213217083403U, 4606245366082353408U,
+ 13829617402937129216U, 4601672213217083403U,
+ 4604959323120302796U, 4604100215502905499U,
+ 13827472252357681307U, 4604959323120302796U,
+ 4589524267239410099U, 4607161910007591876U,
+ 13830533946862367684U, 4589524267239410099U,
+ 4607153778602162496U, 4590406145430462614U,
+ 13813778182285238422U, 4607153778602162496U,
+ 4604016517974851588U, 4605031521104517324U,
+ 13828403557959293132U, 4604016517974851588U,
+ 4606195668621671667U, 4601869677011524443U,
+ 13825241713866300251U, 4606195668621671667U,
+ 4599217346014614711U, 4606744984357082948U,
+ 13830117021211858756U, 4599217346014614711U,
+ 4606841238740778884U, 4598582729657176439U,
+ 13821954766511952247U, 4606841238740778884U,
+ 4602454542796181607U, 4606039359984203741U,
+ 13829411396838979549U, 4602454542796181607U,
+ 4605241877142478242U, 4603760198400967492U,
+ 13827132235255743300U, 4605241877142478242U,
+ 4593046061348462537U, 4607121277474223905U,
+ 13830493314328999713U, 4593046061348462537U,
+ 4607040195955932526U, 4595545269419264690U,
+ 13818917306274040498U, 4607040195955932526U,
+ 4603316355454250015U, 4605571053506370248U,
+ 13828943090361146056U, 4603316355454250015U,
+ 4605755272910869620U, 4603040651631881451U,
+ 13826412688486657259U, 4605755272910869620U,
+ 4596846128749438754U, 4606975506703684317U,
+ 13830347543558460125U, 4596846128749438754U,
+ 4606558823023444576U, 4600257918160607478U,
+ 13823629955015383286U, 4606558823023444576U,
+ 4600870609507958271U, 4606431930490633905U,
+ 13829803967345409713U, 4600870609507958271U,
+ 4604660425598397818U, 4604425958770613225U,
+ 13827797995625389033U, 4604660425598397818U,
+ 4580962600092897021U, 4607180892816495009U,
+ 13830552929671270817U, 4580962600092897021U,
+ 4607180892816495009U, 4580962600092897021U,
+ 13804334636947672829U, 4607180892816495009U,
+ 4604425958770613225U, 4604660425598397818U,
+ 13828032462453173626U, 4604425958770613225U,
+ 4606431930490633905U, 4600870609507958271U,
+ 13824242646362734079U, 4606431930490633905U,
+ 4600257918160607478U, 4606558823023444576U,
+ 13829930859878220384U, 4600257918160607478U,
+ 4606975506703684317U, 4596846128749438754U,
+ 13820218165604214562U, 4606975506703684317U,
+ 4603040651631881451U, 4605755272910869620U,
+ 13829127309765645428U, 4603040651631881451U,
+ 4605571053506370248U, 4603316355454250015U,
+ 13826688392309025823U, 4605571053506370248U,
+ 4595545269419264690U, 4607040195955932526U,
+ 13830412232810708334U, 4595545269419264690U,
+ 4607121277474223905U, 4593046061348462537U,
+ 13816418098203238345U, 4607121277474223905U,
+ 4603760198400967492U, 4605241877142478242U,
+ 13828613913997254050U, 4603760198400967492U,
+ 4606039359984203741U, 4602454542796181607U,
+ 13825826579650957415U, 4606039359984203741U,
+ 4598582729657176439U, 4606841238740778884U,
+ 13830213275595554692U, 4598582729657176439U,
+ 4606744984357082948U, 4599217346014614711U,
+ 13822589382869390519U, 4606744984357082948U,
+ 4601869677011524443U, 4606195668621671667U,
+ 13829567705476447475U, 4601869677011524443U,
+ 4605031521104517324U, 4604016517974851588U,
+ 13827388554829627396U, 4605031521104517324U,
+ 4590406145430462614U, 4607153778602162496U,
+ 13830525815456938304U, 4590406145430462614U,
+ 4607161910007591876U, 4589524267239410099U,
+ 13812896304094185907U, 4607161910007591876U,
+ 4604100215502905499U, 4604959323120302796U,
+ 13828331359975078604U, 4604100215502905499U,
+ 4606245366082353408U, 4601672213217083403U,
+ 13825044250071859211U, 4606245366082353408U,
+ 4599427256825614420U, 4606710311774494716U,
+ 13830082348629270524U, 4599427256825614420U,
+ 4606870719641066940U, 4598369669086960528U,
+ 13821741705941736336U, 4606870719641066940U,
+ 4602646891659203088U, 4605984877841711338U,
+ 13829356914696487146U, 4602646891659203088U,
+ 4605309881318010327U, 4603673059103075106U,
+ 13827045095957850914U, 4605309881318010327U,
+ 4593797652641645341U, 4607107746899444102U,
+ 13830479783754219910U, 4593797652641645341U,
+ 4607059093103722971U, 4595109641634432498U,
+ 13818481678489208306U, 4607059093103722971U,
+ 4603406726595779752U, 4605507406967535927U,
+ 13828879443822311735U, 4603406726595779752U,
+ 4605814408482919348U, 4602947266358709886U,
+ 13826319303213485694U, 4605814408482919348U,
+ 4597277522845151878U, 4606951288507767453U,
+ 13830323325362543261U, 4597277522845151878U,
+ 4606598603759044570U, 4600051662802353687U,
+ 13823423699657129495U, 4606598603759044570U,
+ 4601072712526242277U, 4606387137437298591U,
+ 13829759174292074399U, 4601072712526242277U,
+ 4604736643460027021U, 4604345904647073908U,
+ 13827717941501849716U, 4604736643460027021U,
+ 4584498631466405633U, 4607178180169683960U,
+ 13830550217024459768U, 4584498631466405633U,
+ 4607174111710118367U, 4586348876009622851U,
+ 13809720912864398659U, 4607174111710118367U,
+ 4604264921241055824U, 4604811873195349477U,
+ 13828183910050125285U, 4604264921241055824U,
+ 4606341107699334546U, 4601273700967202825U,
+ 13824645737821978633U, 4606341107699334546U,
+ 4599844446633109139U, 4606637115963965612U,
+ 13830009152818741420U, 4599844446633109139U,
+ 4606925748668145757U, 4597707695679609371U,
+ 13821079732534385179U, 4606925748668145757U,
+ 4602853162432841185U, 4605872393621214213U,
+ 13829244430475990021U, 4602853162432841185U,
+ 4605442656228245717U, 4603496309891590679U,
+ 13826868346746366487U, 4605442656228245717U,
+ 4594673119063280916U, 4607076652372832968U,
+ 13830448689227608776U, 4594673119063280916U,
+ 4607092871118901179U, 4594235767444503503U,
+ 13817607804299279311U, 4607092871118901179U,
+ 4603585091850767959U, 4605376811039722786U,
+ 13828748847894498594U, 4603585091850767959U,
+ 4605929219593405673U, 4602758354025980442U,
+ 13826130390880756250U, 4605929219593405673U,
+ 4598136582470364665U, 4606898891031025132U,
+ 13830270927885800940U, 4598136582470364665U,
+ 4606674353838411301U, 4599636300858866724U,
+ 13823008337713642532U, 4606674353838411301U,
+ 4601473544562720001U, 4606293848208650998U,
+ 13829665885063426806U, 4601473544562720001U,
+ 4604886103475043762U, 4604183020748362039U,
+ 13827555057603137847U, 4604886103475043762U,
+ 4588115294056142819U, 4607168688050493276U,
+ 13830540724905269084U, 4588115294056142819U,
+ 4607144295058764886U, 4591287158938884897U,
+ 13814659195793660705U, 4607144295058764886U,
+ 4603931940768740167U, 4605102686554936490U,
+ 13828474723409712298U, 4603931940768740167U,
+ 4606144763310860551U, 4602065906208722008U,
+ 13825437943063497816U, 4606144763310860551U,
+ 4599006600037663623U, 4606778366364612594U,
+ 13830150403219388402U, 4599006600037663623U,
+ 4606810452769876110U, 4598795050632330097U,
+ 13822167087487105905U, 4606810452769876110U,
+ 4602260871257280788U, 4606092657816072624U,
+ 13829464694670848432U, 4602260871257280788U,
+ 4605172808754305228U, 4603846496621587377U,
+ 13827218533476363185U, 4605172808754305228U,
+ 4592167175087283203U, 4607133460805585796U,
+ 13830505497660361604U, 4592167175087283203U,
+ 4607019963775302583U, 4595979936813835462U,
+ 13819351973668611270U, 4607019963775302583U,
+ 4603225210076562971U, 4605633586259814045U,
+ 13829005623114589853U, 4603225210076562971U,
+ 4605694995810664660U, 4603133304188877240U,
+ 13826505341043653048U, 4605694995810664660U,
+ 4596413578358834022U, 4606998399608725124U,
+ 13830370436463500932U, 4596413578358834022U,
+ 4606517779747998088U, 4600463181646572228U,
+ 13823835218501348036U, 4606517779747998088U,
+ 4600667422348321968U, 4606475480113671417U,
+ 13829847516968447225U, 4600667422348321968U,
+ 4604583231088591477U, 4604505071555817232U,
+ 13827877108410593040U, 4604583231088591477U,
+ 4573724215515480177U, 4607182249242036882U,
+ 13830554286096812690U, 4573724215515480177U,
+ 4607182376410422530U, 4569220649180767418U,
+ 13792592686035543226U, 4607182376410422530U,
+ 4604524701268679793U, 4604563781218984604U,
+ 13827935818073760412U, 4604524701268679793U,
+ 4606486172460753999U, 4600616459743653188U,
+ 13823988496598428996U, 4606486172460753999U,
+ 4600514338912178239U, 4606507322377452870U,
+ 13829879359232228678U, 4600514338912178239U,
+ 4607003915349878877U, 4596305267720071930U,
+ 13819677304574847738U, 4607003915349878877U,
+ 4603156351203636159U, 4605679749231851918U,
+ 13829051786086627726U, 4603156351203636159U,
+ 4605649044311923410U, 4603202304363743346U,
+ 13826574341218519154U, 4605649044311923410U,
+ 4596088445927168004U, 4607014697483910382U,
+ 13830386734338686190U, 4596088445927168004U,
+ 4607136295912168606U, 4591947271803021404U,
+ 13815319308657797212U, 4607136295912168606U,
+ 4603867938232615808U, 4605155376589456981U,
+ 13828527413444232789U, 4603867938232615808U,
+ 4606105796280968177U, 4602212250118051877U,
+ 13825584286972827685U, 4606105796280968177U,
+ 4598848011564831930U, 4606802552898869248U,
+ 13830174589753645056U, 4598848011564831930U,
+ 4606786509620734768U, 4598953786765296928U,
+ 13822325823620072736U, 4606786509620734768U,
+ 4602114767134999006U, 4606131849150971908U,
+ 13829503886005747716U, 4602114767134999006U,
+ 4605120315324767624U, 4603910660507251362U,
+ 13827282697362027170U, 4605120315324767624U,
+ 4591507261658050721U, 4607141713064252300U,
+ 13830513749919028108U, 4591507261658050721U,
+ 4607170170974224083U, 4587673791460508439U,
+ 13811045828315284247U, 4607170170974224083U,
+ 4604203581176243359U, 4604867640218014515U,
+ 13828239677072790323U, 4604203581176243359U,
+ 4606305777984577632U, 4601423692641949331U,
+ 13824795729496725139U, 4606305777984577632U,
+ 4599688422741010356U, 4606665164148251002U,
+ 13830037201003026810U, 4599688422741010356U,
+ 4606905728766014348U, 4598029484874872834U,
+ 13821401521729648642U, 4606905728766014348U,
+ 4602782121393764535U, 4605915122243179241U,
+ 13829287159097955049U, 4602782121393764535U,
+ 4605393374401988274U, 4603562972219549215U,
+ 13826935009074325023U, 4605393374401988274U,
+ 4594345179472540681U, 4607088942243446236U,
+ 13830460979098222044U, 4594345179472540681U,
+ 4607080832832247697U, 4594563856311064231U,
+ 13817935893165840039U, 4607080832832247697U,
+ 4603518581031047189U, 4605426297151190466U,
+ 13828798334005966274U, 4603518581031047189U,
+ 4605886709123365959U, 4602829525820289164U,
+ 13826201562675064972U, 4605886709123365959U,
+ 4597815040470278984U, 4606919157647773535U,
+ 13830291194502549343U, 4597815040470278984U,
+ 4606646545123403481U, 4599792496117920694U,
+ 13823164532972696502U, 4606646545123403481U,
+ 4601323770373937522U, 4606329407841126011U,
+ 13829701444695901819U, 4601323770373937522U,
+ 4604830524903495634U, 4604244531615310815U,
+ 13827616568470086623U, 4604830524903495634U,
+ 4586790578280679046U, 4607172882816799076U,
+ 13830544919671574884U, 4586790578280679046U,
+ 4607178985458280057U, 4583614727651146525U,
+ 13806986764505922333U, 4607178985458280057U,
+ 4604366005771528720U, 4604717681185626434U,
+ 13828089718040402242U, 4604366005771528720U,
+ 4606398451906509788U, 4601022290077223616U,
+ 13824394326931999424U, 4606398451906509788U,
+ 4600103317933788342U, 4606588777269136769U,
+ 13829960814123912577U, 4600103317933788342U,
+ 4606957467106717424U, 4597169786279785693U,
+ 13820541823134561501U, 4606957467106717424U,
+ 4602970680601913687U, 4605799732098147061U,
+ 13829171768952922869U, 4602970680601913687U,
+ 4605523422498301790U, 4603384207141321914U,
+ 13826756243996097722U, 4605523422498301790U,
+ 4595218635031890910U, 4607054494135176056U,
+ 13830426530989951864U, 4595218635031890910U,
+ 4607111255739239816U, 4593688012422887515U,
+ 13817060049277663323U, 4607111255739239816U,
+ 4603694922063032361U, 4605292980606880364U,
+ 13828665017461656172U, 4603694922063032361U,
+ 4605998608960791335U, 4602598930031891166U,
+ 13825970966886666974U, 4605998608960791335U,
+ 4598423001813699022U, 4606863472012527185U,
+ 13830235508867302993U, 4598423001813699022U,
+ 4606719100629313491U, 4599374859150636784U,
+ 13822746896005412592U, 4606719100629313491U,
+ 4601721693286060937U, 4606233055365547081U,
+ 13829605092220322889U, 4601721693286060937U,
+ 4604977468824438271U, 4604079374282302598U,
+ 13827451411137078406U, 4604977468824438271U,
+ 4589744810590291021U, 4607160003989618959U,
+ 13830532040844394767U, 4589744810590291021U,
+ 4607155938267770208U, 4590185751760970393U,
+ 13813557788615746201U, 4607155938267770208U,
+ 4604037525321326463U, 4605013567986435066U,
+ 13828385604841210874U, 4604037525321326463U,
+ 4606208206518262803U, 4601820425647934753U,
+ 13825192462502710561U, 4606208206518262803U,
+ 4599269903251194481U, 4606736437002195879U,
+ 13830108473856971687U, 4599269903251194481U,
+ 4606848731493011465U, 4598529532600161144U,
+ 13821901569454936952U, 4606848731493011465U,
+ 4602502755147763107U, 4606025850160239809U,
+ 13829397887015015617U, 4602502755147763107U,
+ 4605258978359093269U, 4603738491917026584U,
+ 13827110528771802392U, 4605258978359093269U,
+ 4593265590854265407U, 4607118021058468598U,
+ 13830490057913244406U, 4593265590854265407U,
+ 4607045045516813836U, 4595436449949385485U,
+ 13818808486804161293U, 4607045045516813836U,
+ 4603339021357904144U, 4605555245917486022U,
+ 13828927282772261830U, 4603339021357904144U,
+ 4605770164172969910U, 4603017373458244943U,
+ 13826389410313020751U, 4605770164172969910U,
+ 4596954088216812973U, 4606969576261663845U,
+ 13830341613116439653U, 4596954088216812973U,
+ 4606568886807728474U, 4600206446098256018U,
+ 13823578482953031826U, 4606568886807728474U,
+ 4600921238092511730U, 4606420848538580260U,
+ 13829792885393356068U, 4600921238092511730U,
+ 4604679572075463103U, 4604406033021674239U,
+ 13827778069876450047U, 4604679572075463103U,
+ 4581846703643734566U, 4607180341788068727U,
+ 13830552378642844535U, 4581846703643734566U,
+ 4607181359080094673U, 4579996072175835083U,
+ 13803368109030610891U, 4607181359080094673U,
+ 4604445825685214043U, 4604641218080103285U,
+ 13828013254934879093U, 4604445825685214043U,
+ 4606442934727379583U, 4600819913163773071U,
+ 13824191950018548879U, 4606442934727379583U,
+ 4600309328230211502U, 4606548680329491866U,
+ 13829920717184267674U, 4600309328230211502U,
+ 4606981354314050484U, 4596738097012783531U,
+ 13820110133867559339U, 4606981354314050484U,
+ 4603063884010218172U, 4605740310302420207U,
+ 13829112347157196015U, 4603063884010218172U,
+ 4605586791482848547U, 4603293641160266722U,
+ 13826665678015042530U, 4605586791482848547U,
+ 4595654028864046335U, 4607035262954517034U,
+ 13830407299809292842U, 4595654028864046335U,
+ 4607124449686274900U, 4592826452951465409U,
+ 13816198489806241217U, 4607124449686274900U,
+ 4603781852316960384U, 4605224709411790590U,
+ 13828596746266566398U, 4603781852316960384U,
+ 4606052795787882823U, 4602406247776385022U,
+ 13825778284631160830U, 4606052795787882823U,
+ 4598635880488956483U, 4606833664420673202U,
+ 13830205701275449010U, 4598635880488956483U,
+ 4606753451050079834U, 4599164736579548843U,
+ 13822536773434324651U, 4606753451050079834U,
+ 4601918851211878557U, 4606183055233559255U,
+ 13829555092088335063U, 4601918851211878557U,
+ 4605049409688478101U, 4603995455647851249U,
+ 13827367492502627057U, 4605049409688478101U,
+ 4590626485056654602U, 4607151534426937478U,
+ 13830523571281713286U, 4590626485056654602U,
+ 4607163731439411601U, 4589303678145802340U,
+ 13812675715000578148U, 4607163731439411601U,
+ 4604121000955189926U, 4604941113561600762U,
+ 13828313150416376570U, 4604121000955189926U,
+ 4606257600839867033U, 4601622657843474729U,
+ 13824994694698250537U, 4606257600839867033U,
+ 4599479600326345459U, 4606701442584137310U,
+ 13830073479438913118U, 4599479600326345459U,
+ 4606877885424248132U, 4598316292140394014U,
+ 13821688328995169822U, 4606877885424248132U,
+ 4602686793990243041U, 4605971073215153165U,
+ 13829343110069928973U, 4602686793990243041U,
+ 4605326714874986465U, 4603651144395358093U,
+ 13827023181250133901U, 4605326714874986465U,
+ 4593907249284540294U, 4607104153983298999U,
+ 13830476190838074807U, 4593907249284540294U,
+ 4607063608453868552U, 4595000592312171144U,
+ 13818372629166946952U, 4607063608453868552U,
+ 4603429196809300824U, 4605491322423429598U,
+ 13828863359278205406U, 4603429196809300824U,
+ 4605829012964735987U, 4602923807199184054U,
+ 13826295844053959862U, 4605829012964735987U,
+ 4597385183080791534U, 4606945027305114062U,
+ 13830317064159889870U, 4597385183080791534U,
+ 4606608350964852124U, 4599999947619525579U,
+ 13823371984474301387U, 4606608350964852124U,
+ 4601123065313358619U, 4606375745674388705U,
+ 13829747782529164513U, 4601123065313358619U,
+ 4604755543975806820U, 4604325745441780828U,
+ 13827697782296556636U, 4604755543975806820U,
+ 4585023436363055487U, 4607177290141793710U,
+ 13830549326996569518U, 4585023436363055487U,
+ 4607175255902437396U, 4585907115494236537U,
+ 13809279152349012345U, 4607175255902437396U,
+ 4604285253548209224U, 4604793159020491611U,
+ 13828165195875267419U, 4604285253548209224U,
+ 4606352730697093817U, 4601223560006786057U,
+ 13824595596861561865U, 4606352730697093817U,
+ 4599896339047301634U, 4606627607157935956U,
+ 13829999644012711764U, 4599896339047301634U,
+ 4606932257325205256U, 4597600270510262682U,
+ 13820972307365038490U, 4606932257325205256U,
+ 4602876755014813164U, 4605858005670328613U,
+ 13829230042525104421U, 4602876755014813164U,
+ 4605458946901419122U, 4603473988668005304U,
+ 13826846025522781112U, 4605458946901419122U,
+ 4594782329999411347U, 4607072388129742377U,
+ 13830444424984518185U, 4594782329999411347U,
+ 4607096716058023245U, 4594126307716900071U,
+ 13817498344571675879U, 4607096716058023245U,
+ 4603607160562208225U, 4605360179893335444U,
+ 13828732216748111252U, 4603607160562208225U,
+ 4605943243960030558U, 4602734543519989142U,
+ 13826106580374764950U, 4605943243960030558U,
+ 4598209407597805010U, 4606891971185517504U,
+ 13830264008040293312U, 4598209407597805010U,
+ 4606683463531482757U, 4599584122834874440U,
+ 13822956159689650248U, 4606683463531482757U,
+ 4601523323048804569U, 4606281842017099424U,
+ 13829653878871875232U, 4601523323048804569U,
+ 4604904503566677638U, 4604162403772767740U,
+ 13827534440627543548U, 4604904503566677638U,
+ 4588556721781247689U, 4607167120476811757U,
+ 13830539157331587565U, 4588556721781247689U,
+ 4607146792632922887U, 4591066993883984169U,
+ 13814439030738759977U, 4607146792632922887U,
+ 4603953166845776383U, 4605084992581147553U,
+ 13828457029435923361U, 4603953166845776383U,
+ 4606157602458368090U, 4602016966272225497U,
+ 13825389003127001305U, 4606157602458368090U,
+ 4599059363095165615U, 4606770142132396069U,
+ 13830142178987171877U, 4599059363095165615U,
+ 4606818271362779153U, 4598742041476147134U,
+ 13822114078330922942U, 4606818271362779153U,
+ 4602309411551204896U, 4606079444829232727U,
+ 13829451481684008535U, 4602309411551204896U,
+ 4605190175055178825U, 4603825001630339212U,
+ 13827197038485115020U, 4605190175055178825U,
+ 4592387007752762956U, 4607130541380624519U,
+ 13830502578235400327U, 4592387007752762956U,
+ 4607025146816593591U, 4595871363584150300U,
+ 13819243400438926108U, 4607025146816593591U,
+ 4603248068256948438U, 4605618058006716661U,
+ 13828990094861492469U, 4603248068256948438U,
+ 4605710171610479304U, 4603110210506737381U,
+ 13826482247361513189U, 4605710171610479304U,
+ 4596521820799644122U, 4606992800820440327U,
+ 13830364837675216135U, 4596521820799644122U,
+ 4606528158595189433U, 4600411960456200676U,
+ 13823783997310976484U, 4606528158595189433U,
+ 4600718319105833937U, 4606464709641375231U,
+ 13829836746496151039U, 4600718319105833937U,
+ 4604602620643553229U, 4604485382263976838U,
+ 13827857419118752646U, 4604602620643553229U,
+ 4576459225186735875U, 4607182037296057423U,
+ 13830554074150833231U, 4576459225186735875U,
+ 4607182037296057423U, 4576459225186735875U,
+ 13799831262041511683U, 4607182037296057423U,
+ 4604485382263976838U, 4604602620643553229U,
+ 13827974657498329037U, 4604485382263976838U,
+ 4606464709641375231U, 4600718319105833937U,
+ 13824090355960609745U, 4606464709641375231U,
+ 4600411960456200676U, 4606528158595189433U,
+ 13829900195449965241U, 4600411960456200676U,
+ 4606992800820440327U, 4596521820799644122U,
+ 13819893857654419930U, 4606992800820440327U,
+ 4603110210506737381U, 4605710171610479304U,
+ 13829082208465255112U, 4603110210506737381U,
+ 4605618058006716661U, 4603248068256948438U,
+ 13826620105111724246U, 4605618058006716661U,
+ 4595871363584150300U, 4607025146816593591U,
+ 13830397183671369399U, 4595871363584150300U,
+ 4607130541380624519U, 4592387007752762956U,
+ 13815759044607538764U, 4607130541380624519U,
+ 4603825001630339212U, 4605190175055178825U,
+ 13828562211909954633U, 4603825001630339212U,
+ 4606079444829232727U, 4602309411551204896U,
+ 13825681448405980704U, 4606079444829232727U,
+ 4598742041476147134U, 4606818271362779153U,
+ 13830190308217554961U, 4598742041476147134U,
+ 4606770142132396069U, 4599059363095165615U,
+ 13822431399949941423U, 4606770142132396069U,
+ 4602016966272225497U, 4606157602458368090U,
+ 13829529639313143898U, 4602016966272225497U,
+ 4605084992581147553U, 4603953166845776383U,
+ 13827325203700552191U, 4605084992581147553U,
+ 4591066993883984169U, 4607146792632922887U,
+ 13830518829487698695U, 4591066993883984169U,
+ 4607167120476811757U, 4588556721781247689U,
+ 13811928758636023497U, 4607167120476811757U,
+ 4604162403772767740U, 4604904503566677638U,
+ 13828276540421453446U, 4604162403772767740U,
+ 4606281842017099424U, 4601523323048804569U,
+ 13824895359903580377U, 4606281842017099424U,
+ 4599584122834874440U, 4606683463531482757U,
+ 13830055500386258565U, 4599584122834874440U,
+ 4606891971185517504U, 4598209407597805010U,
+ 13821581444452580818U, 4606891971185517504U,
+ 4602734543519989142U, 4605943243960030558U,
+ 13829315280814806366U, 4602734543519989142U,
+ 4605360179893335444U, 4603607160562208225U,
+ 13826979197416984033U, 4605360179893335444U,
+ 4594126307716900071U, 4607096716058023245U,
+ 13830468752912799053U, 4594126307716900071U,
+ 4607072388129742377U, 4594782329999411347U,
+ 13818154366854187155U, 4607072388129742377U,
+ 4603473988668005304U, 4605458946901419122U,
+ 13828830983756194930U, 4603473988668005304U,
+ 4605858005670328613U, 4602876755014813164U,
+ 13826248791869588972U, 4605858005670328613U,
+ 4597600270510262682U, 4606932257325205256U,
+ 13830304294179981064U, 4597600270510262682U,
+ 4606627607157935956U, 4599896339047301634U,
+ 13823268375902077442U, 4606627607157935956U,
+ 4601223560006786057U, 4606352730697093817U,
+ 13829724767551869625U, 4601223560006786057U,
+ 4604793159020491611U, 4604285253548209224U,
+ 13827657290402985032U, 4604793159020491611U,
+ 4585907115494236537U, 4607175255902437396U,
+ 13830547292757213204U, 4585907115494236537U,
+ 4607177290141793710U, 4585023436363055487U,
+ 13808395473217831295U, 4607177290141793710U,
+ 4604325745441780828U, 4604755543975806820U,
+ 13828127580830582628U, 4604325745441780828U,
+ 4606375745674388705U, 4601123065313358619U,
+ 13824495102168134427U, 4606375745674388705U,
+ 4599999947619525579U, 4606608350964852124U,
+ 13829980387819627932U, 4599999947619525579U,
+ 4606945027305114062U, 4597385183080791534U,
+ 13820757219935567342U, 4606945027305114062U,
+ 4602923807199184054U, 4605829012964735987U,
+ 13829201049819511795U, 4602923807199184054U,
+ 4605491322423429598U, 4603429196809300824U,
+ 13826801233664076632U, 4605491322423429598U,
+ 4595000592312171144U, 4607063608453868552U,
+ 13830435645308644360U, 4595000592312171144U,
+ 4607104153983298999U, 4593907249284540294U,
+ 13817279286139316102U, 4607104153983298999U,
+ 4603651144395358093U, 4605326714874986465U,
+ 13828698751729762273U, 4603651144395358093U,
+ 4605971073215153165U, 4602686793990243041U,
+ 13826058830845018849U, 4605971073215153165U,
+ 4598316292140394014U, 4606877885424248132U,
+ 13830249922279023940U, 4598316292140394014U,
+ 4606701442584137310U, 4599479600326345459U,
+ 13822851637181121267U, 4606701442584137310U,
+ 4601622657843474729U, 4606257600839867033U,
+ 13829629637694642841U, 4601622657843474729U,
+ 4604941113561600762U, 4604121000955189926U,
+ 13827493037809965734U, 4604941113561600762U,
+ 4589303678145802340U, 4607163731439411601U,
+ 13830535768294187409U, 4589303678145802340U,
+ 4607151534426937478U, 4590626485056654602U,
+ 13813998521911430410U, 4607151534426937478U,
+ 4603995455647851249U, 4605049409688478101U,
+ 13828421446543253909U, 4603995455647851249U,
+ 4606183055233559255U, 4601918851211878557U,
+ 13825290888066654365U, 4606183055233559255U,
+ 4599164736579548843U, 4606753451050079834U,
+ 13830125487904855642U, 4599164736579548843U,
+ 4606833664420673202U, 4598635880488956483U,
+ 13822007917343732291U, 4606833664420673202U,
+ 4602406247776385022U, 4606052795787882823U,
+ 13829424832642658631U, 4602406247776385022U,
+ 4605224709411790590U, 4603781852316960384U,
+ 13827153889171736192U, 4605224709411790590U,
+ 4592826452951465409U, 4607124449686274900U,
+ 13830496486541050708U, 4592826452951465409U,
+ 4607035262954517034U, 4595654028864046335U,
+ 13819026065718822143U, 4607035262954517034U,
+ 4603293641160266722U, 4605586791482848547U,
+ 13828958828337624355U, 4603293641160266722U,
+ 4605740310302420207U, 4603063884010218172U,
+ 13826435920864993980U, 4605740310302420207U,
+ 4596738097012783531U, 4606981354314050484U,
+ 13830353391168826292U, 4596738097012783531U,
+ 4606548680329491866U, 4600309328230211502U,
+ 13823681365084987310U, 4606548680329491866U,
+ 4600819913163773071U, 4606442934727379583U,
+ 13829814971582155391U, 4600819913163773071U,
+ 4604641218080103285U, 4604445825685214043U,
+ 13827817862539989851U, 4604641218080103285U,
+ 4579996072175835083U, 4607181359080094673U,
+ 13830553395934870481U, 4579996072175835083U,
+ 4607180341788068727U, 4581846703643734566U,
+ 13805218740498510374U, 4607180341788068727U,
+ 4604406033021674239U, 4604679572075463103U,
+ 13828051608930238911U, 4604406033021674239U,
+ 4606420848538580260U, 4600921238092511730U,
+ 13824293274947287538U, 4606420848538580260U,
+ 4600206446098256018U, 4606568886807728474U,
+ 13829940923662504282U, 4600206446098256018U,
+ 4606969576261663845U, 4596954088216812973U,
+ 13820326125071588781U, 4606969576261663845U,
+ 4603017373458244943U, 4605770164172969910U,
+ 13829142201027745718U, 4603017373458244943U,
+ 4605555245917486022U, 4603339021357904144U,
+ 13826711058212679952U, 4605555245917486022U,
+ 4595436449949385485U, 4607045045516813836U,
+ 13830417082371589644U, 4595436449949385485U,
+ 4607118021058468598U, 4593265590854265407U,
+ 13816637627709041215U, 4607118021058468598U,
+ 4603738491917026584U, 4605258978359093269U,
+ 13828631015213869077U, 4603738491917026584U,
+ 4606025850160239809U, 4602502755147763107U,
+ 13825874792002538915U, 4606025850160239809U,
+ 4598529532600161144U, 4606848731493011465U,
+ 13830220768347787273U, 4598529532600161144U,
+ 4606736437002195879U, 4599269903251194481U,
+ 13822641940105970289U, 4606736437002195879U,
+ 4601820425647934753U, 4606208206518262803U,
+ 13829580243373038611U, 4601820425647934753U,
+ 4605013567986435066U, 4604037525321326463U,
+ 13827409562176102271U, 4605013567986435066U,
+ 4590185751760970393U, 4607155938267770208U,
+ 13830527975122546016U, 4590185751760970393U,
+ 4607160003989618959U, 4589744810590291021U,
+ 13813116847445066829U, 4607160003989618959U,
+ 4604079374282302598U, 4604977468824438271U,
+ 13828349505679214079U, 4604079374282302598U,
+ 4606233055365547081U, 4601721693286060937U,
+ 13825093730140836745U, 4606233055365547081U,
+ 4599374859150636784U, 4606719100629313491U,
+ 13830091137484089299U, 4599374859150636784U,
+ 4606863472012527185U, 4598423001813699022U,
+ 13821795038668474830U, 4606863472012527185U,
+ 4602598930031891166U, 4605998608960791335U,
+ 13829370645815567143U, 4602598930031891166U,
+ 4605292980606880364U, 4603694922063032361U,
+ 13827066958917808169U, 4605292980606880364U,
+ 4593688012422887515U, 4607111255739239816U,
+ 13830483292594015624U, 4593688012422887515U,
+ 4607054494135176056U, 4595218635031890910U,
+ 13818590671886666718U, 4607054494135176056U,
+ 4603384207141321914U, 4605523422498301790U,
+ 13828895459353077598U, 4603384207141321914U,
+ 4605799732098147061U, 4602970680601913687U,
+ 13826342717456689495U, 4605799732098147061U,
+ 4597169786279785693U, 4606957467106717424U,
+ 13830329503961493232U, 4597169786279785693U,
+ 4606588777269136769U, 4600103317933788342U,
+ 13823475354788564150U, 4606588777269136769U,
+ 4601022290077223616U, 4606398451906509788U,
+ 13829770488761285596U, 4601022290077223616U,
+ 4604717681185626434U, 4604366005771528720U,
+ 13827738042626304528U, 4604717681185626434U,
+ 4583614727651146525U, 4607178985458280057U,
+ 13830551022313055865U, 4583614727651146525U,
+ 4607172882816799076U, 4586790578280679046U,
+ 13810162615135454854U, 4607172882816799076U,
+ 4604244531615310815U, 4604830524903495634U,
+ 13828202561758271442U, 4604244531615310815U,
+ 4606329407841126011U, 4601323770373937522U,
+ 13824695807228713330U, 4606329407841126011U,
+ 4599792496117920694U, 4606646545123403481U,
+ 13830018581978179289U, 4599792496117920694U,
+ 4606919157647773535U, 4597815040470278984U,
+ 13821187077325054792U, 4606919157647773535U,
+ 4602829525820289164U, 4605886709123365959U,
+ 13829258745978141767U, 4602829525820289164U,
+ 4605426297151190466U, 4603518581031047189U,
+ 13826890617885822997U, 4605426297151190466U,
+ 4594563856311064231U, 4607080832832247697U,
+ 13830452869687023505U, 4594563856311064231U,
+ 4607088942243446236U, 4594345179472540681U,
+ 13817717216327316489U, 4607088942243446236U,
+ 4603562972219549215U, 4605393374401988274U,
+ 13828765411256764082U, 4603562972219549215U,
+ 4605915122243179241U, 4602782121393764535U,
+ 13826154158248540343U, 4605915122243179241U,
+ 4598029484874872834U, 4606905728766014348U,
+ 13830277765620790156U, 4598029484874872834U,
+ 4606665164148251002U, 4599688422741010356U,
+ 13823060459595786164U, 4606665164148251002U,
+ 4601423692641949331U, 4606305777984577632U,
+ 13829677814839353440U, 4601423692641949331U,
+ 4604867640218014515U, 4604203581176243359U,
+ 13827575618031019167U, 4604867640218014515U,
+ 4587673791460508439U, 4607170170974224083U,
+ 13830542207828999891U, 4587673791460508439U,
+ 4607141713064252300U, 4591507261658050721U,
+ 13814879298512826529U, 4607141713064252300U,
+ 4603910660507251362U, 4605120315324767624U,
+ 13828492352179543432U, 4603910660507251362U,
+ 4606131849150971908U, 4602114767134999006U,
+ 13825486803989774814U, 4606131849150971908U,
+ 4598953786765296928U, 4606786509620734768U,
+ 13830158546475510576U, 4598953786765296928U,
+ 4606802552898869248U, 4598848011564831930U,
+ 13822220048419607738U, 4606802552898869248U,
+ 4602212250118051877U, 4606105796280968177U,
+ 13829477833135743985U, 4602212250118051877U,
+ 4605155376589456981U, 4603867938232615808U,
+ 13827239975087391616U, 4605155376589456981U,
+ 4591947271803021404U, 4607136295912168606U,
+ 13830508332766944414U, 4591947271803021404U,
+ 4607014697483910382U, 4596088445927168004U,
+ 13819460482781943812U, 4607014697483910382U,
+ 4603202304363743346U, 4605649044311923410U,
+ 13829021081166699218U, 4603202304363743346U,
+ 4605679749231851918U, 4603156351203636159U,
+ 13826528388058411967U, 4605679749231851918U,
+ 4596305267720071930U, 4607003915349878877U,
+ 13830375952204654685U, 4596305267720071930U,
+ 4606507322377452870U, 4600514338912178239U,
+ 13823886375766954047U, 4606507322377452870U,
+ 4600616459743653188U, 4606486172460753999U,
+ 13829858209315529807U, 4600616459743653188U,
+ 4604563781218984604U, 4604524701268679793U,
+ 13827896738123455601U, 4604563781218984604U,
+ 4569220649180767418U, 4607182376410422530U,
+ 13830554413265198338U, 4569220649180767418U
+};
+
+const fpr fpr_p2_tab[] = {
+ 4611686018427387904U,
+ 4607182418800017408U,
+ 4602678819172646912U,
+ 4598175219545276416U,
+ 4593671619917905920U,
+ 4589168020290535424U,
+ 4584664420663164928U,
+ 4580160821035794432U,
+ 4575657221408423936U,
+ 4571153621781053440U,
+ 4566650022153682944U
+};
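+
+/*
+ * Illustrative cross-check of the table above (not used by the code):
+ * each entry is the IEEE-754 binary64 encoding of a power of two, from
+ * 2.0 down to 2^(-9), so consecutive entries differ by exactly one step
+ * of the exponent field, i.e. by 1 << 52 = 4503599627370496 in the
+ * integer encoding:
+ *
+ *   4611686018427387904 = 0x4000000000000000 = 2.0
+ *   4607182418800017408 = 0x3FF0000000000000 = 1.0
+ *   4602678819172646912 = 0x3FE0000000000000 = 0.5
+ */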
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.h b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.h
new file mode 100644
index 000000000..3e80b5068
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.h
@@ -0,0 +1,491 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+/*
+ * Custom floating-point implementation with integer arithmetics. We
+ * use IEEE-754 "binary64" format, with some simplifications:
+ *
+ * - Top bit is s = 1 for negative, 0 for positive.
+ *
+ * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
+ *
+ * - Mantissa m uses the 52 low bits.
+ *
+ * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
+ * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
+ * less than 1.0), but the top bit (equal to 1 by definition) is omitted
+ * in the encoding.
+ *
+ * In IEEE-754, there are some special values:
+ *
+ * - If e = 2047, then the value is either an infinite (m = 0) or
+ * a NaN (m != 0).
+ *
+ * - If e = 0, then the value is either a zero (m = 0) or a subnormal,
+ * aka "denormalized number" (m != 0).
+ *
+ * Of these, we only need the zeros. The caller is responsible for not
+ * providing operands that would lead to infinites, NaNs or subnormals.
+ * If inputs are such that values go out of range, then indeterminate
+ * values are returned (it would still be deterministic, but no specific
+ * value may be relied upon).
+ *
+ * At the C level, the three parts are stored in a 64-bit unsigned
+ * word.
+ *
+ * One may note that a property of the IEEE-754 format is that order
+ * is preserved for positive values: if two positive floating-point
+ * values x and y are such that x < y, then their respective encodings
+ * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
+ * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
+ * y < 0, and x < y, then i64(x) > i64(y).
+ *
+ * IMPORTANT ASSUMPTIONS:
+ * ======================
+ *
+ * For proper computations, and constant-time behaviour, we assume the
+ * following:
+ *
+ * - 32x32->64 multiplication (unsigned) has an execution time that
+ * is independent of its operands. This is true of most modern
+ * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
+ * and M3 (in the M0 and M0+, this is done in software, so it depends
+ * on that routine), and the PowerPC cores from the G3/G4 lines.
+ * For more info, see: https://www.bearssl.org/ctmul.html
+ *
+ * - Left-shifts and right-shifts of 32-bit values have an execution
+ * time which does not depend on the shifted value nor on the
+ * shift count. An historical exception is the Pentium IV, but most
+ * modern CPU have barrel shifters. Some small microcontrollers
+ * might have varying-time shifts (not the ARM Cortex M*, though).
+ *
+ * - Right-shift of a signed negative value performs a sign extension.
+ * As per the C standard, this operation returns an
+ * implementation-defined result (this is NOT an "undefined
+ * behaviour"). On most/all systems, an arithmetic shift is
+ * performed, because this is what makes most sense.
+ */
+
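+/*
+ * Worked example of the encoding described above: the value 1.0 has
+ * s = 0, biased exponent e = 1023 and mantissa m = 0, so its encoding
+ * is 1023 << 52 = 4607182418800017408 (the fpr_one constant defined
+ * further down); adding 1 << 63, i.e. setting the sign bit, would give
+ * the encoding of -1.0.
+ */
+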
+/*
+ * Normally we should declare the 'fpr' type to be a struct or union
+ * around the internal 64-bit value; however, we want to use the
+ * direct 64-bit integer type to enable a lighter call convention on
+ * ARM platforms. This means that direct (invalid) use of operators
+ * such as '*' or '+' will not be caught by the compiler. We rely on
+ * the "normal" (non-emulated) code to detect such instances.
+ */
+typedef uint64_t fpr;
+
+/*
+ * For computations, we split values into an integral mantissa in the
+ * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
+ * "sticky" (it is set to 1 if any of the bits below it is 1); when
+ * re-encoding, the low two bits are dropped, but may induce an
+ * increment in the value for proper rounding.
+ */
+
+/*
+ * Right-shift a 64-bit unsigned value by a possibly secret shift count.
+ * We assumed that the underlying architecture had a barrel shifter for
+ * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will
+ * typically invoke a software routine that is not necessarily
+ * constant-time; hence the function below.
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ursh(uint64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Right-shift a 64-bit signed value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline int64_t
+fpr_irsh(int64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Left-shift a 64-bit unsigned value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ulsh(uint64_t x, int n) {
+ x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
+ return x << (n & 31);
+}
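+
+/*
+ * Illustration of the pattern shared by the three shift helpers above:
+ * with n = 40, n >> 5 is 1, the mask is all-ones, and the first
+ * statement replaces x with x shifted by the constant 32; the final
+ * shift then uses n & 31 = 8, for a total of 40 bits. With n = 8 the
+ * mask is zero, x is untouched, and only the 8-bit shift is applied.
+ * The variable shift count is thus always in the 0..31 range, so no
+ * data-dependent 64-bit software shift is ever invoked.
+ */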
+
+/*
+ * Expectations:
+ * s = 0 or 1
+ * exponent e is "arbitrary" and unbiased
+ * 2^54 <= m < 2^55
+ * Numerical value is (-1)^s * m * 2^e
+ *
+ * Exponents which are too low lead to value zero. If the exponent is
+ * too large, the returned value is indeterminate.
+ *
+ * If m = 0, then a zero is returned (using the provided sign).
+ * If e < -1076, then a zero is returned (regardless of the value of m).
+ * If e >= -1076 and e != 0, m must be within the expected range
+ * (2^54 to 2^55-1).
+ */
+static inline fpr
+FPR(int s, int e, uint64_t m) {
+ fpr x;
+ uint32_t t;
+ unsigned f;
+
+ /*
+ * If e >= -1076, then the value is "normal"; otherwise, it
+ * should be a subnormal, which we clamp down to zero.
+ */
+ e += 1076;
+ t = (uint32_t)e >> 31;
+ m &= (uint64_t)t - 1;
+
+ /*
+ * If m = 0 then we want a zero; make e = 0 too, but conserve
+ * the sign.
+ */
+ t = (uint32_t)(m >> 54);
+ e &= -(int)t;
+
+ /*
+ * The 52 mantissa bits come from m. Value m has its top bit set
+ * (unless it is a zero); we leave it "as is": the top bit will
+ * increment the exponent by 1, except when m = 0, which is
+ * exactly what we want.
+ */
+ x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
+
+ /*
+ * Rounding: if the low three bits of m are 011, 110 or 111,
+ * then the value should be incremented to get the next
+ * representable value. This implements the usual
+ * round-to-nearest rule (with preference to even values in case
+ * of a tie). Note that the increment may make a carry spill
+ * into the exponent field, which is again exactly what we want
+ * in that case.
+ */
+ f = (unsigned)m & 7U;
+ x += (0xC8U >> f) & 1;
+ return x;
+}
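+
+/*
+ * Note on the rounding step above: 0xC8 is 11001000 in binary, so
+ * (0xC8 >> f) & 1 equals 1 exactly when f (the low three bits of m)
+ * is 3 (011), 6 (110) or 7 (111), which are precisely the cases listed
+ * in the comment where the encoded value must be incremented.
+ */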
+
+#define fpr_scaled PQCLEAN_FALCONPADDED1024_CLEAN_fpr_scaled
+fpr fpr_scaled(int64_t i, int sc);
+
+static inline fpr
+fpr_of(int64_t i) {
+ return fpr_scaled(i, 0);
+}
+
+static const fpr fpr_q = 4667981563525332992;
+static const fpr fpr_inverse_of_q = 4545632735260551042;
+static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
+static const fpr fpr_inv_sigma[] = {
+ 0, /* unused */
+ 4574611497772390042,
+ 4574501679055810265,
+ 4574396282908341804,
+ 4574245855758572086,
+ 4574103865040221165,
+ 4573969550563515544,
+ 4573842244705920822,
+ 4573721358406441454,
+ 4573606369665796042,
+ 4573496814039276259
+};
+static const fpr fpr_sigma_min[] = {
+ 0, /* unused */
+ 4607707126469777035,
+ 4607777455861499430,
+ 4607846828256951418,
+ 4607949175006100261,
+ 4608049571757433526,
+ 4608148125896792003,
+ 4608244935301382692,
+ 4608340089478362016,
+ 4608433670533905013,
+ 4608525754002622308
+};
+static const fpr fpr_log2 = 4604418534313441775;
+static const fpr fpr_inv_log2 = 4609176140021203710;
+static const fpr fpr_bnorm_max = 4670353323383631276;
+static const fpr fpr_zero = 0;
+static const fpr fpr_one = 4607182418800017408;
+static const fpr fpr_two = 4611686018427387904;
+static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
+static const fpr fpr_ptwo31 = 4746794007248502784;
+static const fpr fpr_ptwo31m1 = 4746794007244308480;
+static const fpr fpr_mtwo31m1 = 13970166044099084288U;
+static const fpr fpr_ptwo63m1 = 4890909195324358656;
+static const fpr fpr_mtwo63m1 = 14114281232179134464U;
+static const fpr fpr_ptwo63 = 4890909195324358656;
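+
+/*
+ * Note: fpr_ptwo63m1 and fpr_ptwo63 are intentionally the same bit
+ * pattern: 2^63-1 is not exactly representable in binary64 and rounds
+ * to 2^63, so both constants encode the value 9223372036854775808.0.
+ */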
+
+static inline int64_t
+fpr_rint(fpr x) {
+ uint64_t m, d;
+ int e;
+ uint32_t s, dd, f;
+
+ /*
+ * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
+ * thus extract the mantissa as a 63-bit integer, then right-shift
+ * it as needed.
+ */
+ m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ e = 1085 - ((int)(x >> 52) & 0x7FF);
+
+ /*
+ * If a shift of more than 63 bits is needed, then simply set m
+ * to zero. This also covers the case of an input operand equal
+ * to zero.
+ */
+ m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
+ e &= 63;
+
+ /*
+ * Right-shift m as needed. Shift count is e. Proper rounding
+ * mandates that:
+ * - If the highest dropped bit is zero, then round low.
+ * - If the highest dropped bit is one, and at least one of the
+ * other dropped bits is one, then round up.
+ * - If the highest dropped bit is one, and all other dropped
+ * bits are zero, then round up if the lowest kept bit is 1,
+ * or low otherwise (i.e. ties are broken by "rounding to even").
+ *
+ * We thus first extract a word consisting of all the dropped bits
+ * AND the lowest kept bit; then we shrink it down to three bits,
+ * the lowest being "sticky".
+ */
+ d = fpr_ulsh(m, 63 - e);
+ dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
+ f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
+ m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
+
+ /*
+ * Apply the sign bit.
+ */
+ s = (uint32_t)(x >> 63);
+ return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
+}
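+
+/*
+ * The last two lines of fpr_rint() apply the sign without a branch:
+ * for s = 1, (m ^ -1) + 1 is the two's-complement negation of m, while
+ * for s = 0 the expression reduces to m unchanged. The same
+ * conditional-negation idiom reappears in fpr_floor() and fpr_trunc()
+ * below.
+ */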
+
+static inline int64_t
+fpr_floor(fpr x) {
+ uint64_t t;
+ int64_t xi;
+ int e, cc;
+
+ /*
+ * We extract the integer as a _signed_ 64-bit integer with
+ * a scaling factor. Since we assume that the value fits
+ * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
+ * absolute value to make it in the 2^62..2^63-1 range: we
+ * will only need a right-shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ t = x >> 63;
+ xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
+ & (((uint64_t)1 << 63) - 1));
+ xi = (xi ^ -(int64_t)t) + (int64_t)t;
+ cc = 1085 - e;
+
+ /*
+ * We perform an arithmetic right-shift on the value. This
+ * applies floor() semantics on both positive and negative values
+ * (rounding toward minus infinity).
+ */
+ xi = fpr_irsh(xi, cc & 63);
+
+ /*
+ * If the true shift count was 64 or more, then we should instead
+ * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
+ * case: -0 will be floored to -1, not 0 (whether this is correct
+ * is debatable; in any case, the other functions normalize zero
+ * to +0).
+ *
+ * For an input of zero, the non-shifted xi was incorrect (we used
+ * a top implicit bit of value 1, not 0), but this does not matter
+ * since this operation will clamp it down.
+ */
+ xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
+ return xi;
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ uint64_t t, xu;
+ int e, cc;
+
+ /*
+ * Extract the absolute value. Since we assume that the value
+ * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
+ * the absolute value into the 2^62..2^63-1 range, and then
+ * do a right shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ cc = 1085 - e;
+ xu = fpr_ursh(xu, cc & 63);
+
+ /*
+ * If the exponent is too low (cc > 63), then the shift was wrong
+ * and we must clamp the value to 0. This also covers the case
+ * of an input equal to zero.
+ */
+ xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
+
+ /*
+ * Apply back the sign, if the source value is negative.
+ */
+ t = x >> 63;
+ xu = (xu ^ -t) + t;
+ return *(int64_t *)&xu;
+}
+
+#define fpr_add PQCLEAN_FALCONPADDED1024_CLEAN_fpr_add
+fpr fpr_add(fpr x, fpr y);
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ y ^= (uint64_t)1 << 63;
+ return fpr_add(x, y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ x ^= (uint64_t)1 << 63;
+ return x;
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ /*
+ * To divide a value by 2, we just have to subtract 1 from its
+ * exponent, but we have to take care of zero.
+ */
+ uint32_t t;
+
+ x -= (uint64_t)1 << 52;
+ t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
+ x &= (uint64_t)t - 1;
+ return x;
+}
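+
+/*
+ * Worked example with the constants above: fpr_one is
+ * 4607182418800017408; subtracting 1 << 52 = 4503599627370496 yields
+ * 4602678819172646912, which is exactly fpr_onehalf, as expected from
+ * decrementing the exponent field by one. The masking step only kicks
+ * in when the input was a (positive or negative) zero, in which case
+ * the wrapped-around exponent field reads 0x7FF and the result is
+ * clamped to +0.
+ */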
+
+static inline fpr
+fpr_double(fpr x) {
+ /*
+ * To double a value, we just increment by one the exponent. We
+ * don't care about infinites or NaNs; however, 0 is a
+ * special case.
+ */
+ x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
+ return x;
+}
+
+#define fpr_mul PQCLEAN_FALCONPADDED1024_CLEAN_fpr_mul
+fpr fpr_mul(fpr x, fpr y);
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return fpr_mul(x, x);
+}
+
+#define fpr_div PQCLEAN_FALCONPADDED1024_CLEAN_fpr_div
+fpr fpr_div(fpr x, fpr y);
+
+static inline fpr
+fpr_inv(fpr x) {
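+ /* 4607182418800017408 is the binary64 encoding of 1.0 (fpr_one). */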
+ return fpr_div(4607182418800017408u, x);
+}
+
+#define fpr_sqrt PQCLEAN_FALCONPADDED1024_CLEAN_fpr_sqrt
+fpr fpr_sqrt(fpr x);
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ /*
+ * If both x and y are positive, then a signed comparison yields
+ * the proper result:
+ * - For positive values, the order is preserved.
+ * - The sign bit is at the same place as in integers, so
+ * sign is preserved.
+ * Moreover, we can compute [x < y] as sgn(x-y) and the computation
+ * of x-y will not overflow.
+ *
+ * If the signs differ, then sgn(x) gives the proper result.
+ *
+ * If both x and y are negative, then the order is reversed.
+ * Hence [x < y] = sgn(y-x). We must compute this separately from
+ * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
+ * case x = y properly.
+ */
+ int cc0, cc1;
+ int64_t sx;
+ int64_t sy;
+
+ sx = *(int64_t *)&x;
+ sy = *(int64_t *)&y;
+ sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
+
+ cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
+ cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
+
+ return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
+}
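+
+/*
+ * In the return expression above, (x & y) >> 63 is 1 only when both
+ * operands have their sign bit set; that is exactly the "both
+ * negative" case in which the reversed comparison cc1 must be selected
+ * instead of cc0.
+ */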
+
+/*
+ * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
+ * bits or so.
+ */
+#define fpr_expm_p63 PQCLEAN_FALCONPADDED1024_CLEAN_fpr_expm_p63
+uint64_t fpr_expm_p63(fpr x, fpr ccs);
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED1024_CLEAN_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED1024_CLEAN_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/inner.h b/src/sig/falcon/pqclean_falcon-padded-1024_clean/inner.h
new file mode 100644
index 000000000..c63ee1ddf
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/inner.h
@@ -0,0 +1,820 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED1024_CLEAN_ macro (e.g. PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
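+
+/*
+ * Illustrative call sequence (a sketch of how a caller is expected to
+ * drive the context; the nonce/msg buffer names are placeholders, not
+ * identifiers defined elsewhere in this code):
+ *
+ *    inner_shake256_context sc;
+ *    inner_shake256_init(&sc);
+ *    inner_shake256_inject(&sc, nonce, nonce_len);
+ *    inner_shake256_inject(&sc, msg, msg_len);
+ *    inner_shake256_flip(&sc);
+ *    ... hash_to_point_*() or inner_shake256_extract(&sc, out, out_len) ...
+ *    inner_shake256_ctx_release(&sc);
+ */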
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
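+
+/*
+ * Sketch of the two-pass usage pattern described above (buffer and
+ * variable names are illustrative only): a first call with out == NULL
+ * returns the exact number of bytes required, after which the caller
+ * allocates a buffer and encodes for real.
+ *
+ *    size_t len = PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(NULL, 0, s2, logn);
+ *    ... allocate len bytes at buf ...
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(buf, len, s2, logn) == 0) {
+ *        ... handle error (encoding failed) ...
+ *    }
+ */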
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists of
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ *      return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] 1/sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. The PRNG draws its initial state from the
+ * provided SHAKE256 context, which must be in "flipped" state.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+ }
+ return v;
+}
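+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * seeding the PRNG from a SHAKE256 context and extracting values. This
+ * assumes the inner_shake256_* wrappers declared earlier in this
+ * header; the name example_prng_demo is hypothetical.
+ */
+#if 0
+static uint64_t
+example_prng_demo(const void *seed, size_t seed_len) {
+	inner_shake256_context sc;
+	prng p;
+
+	inner_shake256_init(&sc);
+	inner_shake256_inject(&sc, seed, seed_len);
+	inner_shake256_flip(&sc);   /* the PRNG expects a flipped context */
+	PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&p, &sc);
+	return prng_get_u64(&p);    /* 64 pseudorandom bits */
+}
+#endif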
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(fpr *f, unsigned logn);
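+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): an
+ * FFT round trip on a small polynomial. For logn = 2 the polynomial
+ * has N = 4 'fpr' coefficients; FFT followed by iFFT recovers them up
+ * to floating-point rounding. The name example_fft_roundtrip is
+ * hypothetical.
+ */
+#if 0
+static void
+example_fft_roundtrip(void) {
+	fpr f[4];
+	size_t u;
+
+	for (u = 0; u < 4; u ++) {
+		f[u] = fpr_of((int64_t)u + 1);   /* 1 + 2X + 3X^2 + 4X^3 */
+	}
+	PQCLEAN_FALCONPADDED1024_CLEAN_FFT(f, 2);    /* to FFT representation */
+	PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(f, 2);   /* back to coefficients */
+}
+#endif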
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
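+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): the
+ * split/merge pair as an FFT-domain round trip, here with logn = 3
+ * (N = 8). Splitting yields two half-size polynomials (4 'fpr' values
+ * each); merging them recovers the original FFT representation. The
+ * name example_split_merge is hypothetical.
+ */
+#if 0
+static void
+example_split_merge(const fpr *f_fft) {
+	fpr f0[4], f1[4], f2[8];
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(f0, f1, f_fft, 3);
+	PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(f2, f0, f1, 3);
+	/* f2 now matches f_fft up to floating-point rounding */
+}
+#endif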
+
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
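+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * buffer sizing for a degree-1024 (logn = 10) key pair. The union
+ * provides the required alignment for uint32_t, uint64_t and fpr; the
+ * name example_keygen_1024 is hypothetical, and the SHAKE256 context
+ * is assumed to be already seeded and flipped.
+ */
+#if 0
+static void
+example_keygen_1024(inner_shake256_context *seeded_rng) {
+	static int8_t f[1024], g[1024], F[1024], G[1024];
+	static uint16_t h[1024];
+	static union {
+		uint8_t b[FALCON_KEYGEN_TEMP_10];   /* 28*2^10 = 28672 bytes */
+		uint64_t dummy_u64;
+		fpr dummy_fpr;
+	} tmp;
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_keygen(seeded_rng,
+		f, g, F, G, h, 10, tmp.b);
+}
+#endif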
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
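+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * expanding a private key and signing with it at degree 1024
+ * (logn = 10). The expanded key takes (8*10+40)*1024 = 122880 bytes
+ * and tmp takes 48*1024 = 49152 bytes, both 64-bit aligned. The name
+ * example_sign_tree_1024 is hypothetical.
+ */
+#if 0
+static void
+example_sign_tree_1024(int16_t *sig, inner_shake256_context *rng,
+	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+	const uint16_t *hm) {
+	static fpr expanded_key[122880 / sizeof(fpr)];
+	static union {
+		uint8_t b[49152];
+		uint64_t dummy_u64;
+		fpr dummy_fpr;
+	} tmp;
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey(expanded_key,
+		f, g, F, G, 10, tmp.b);
+	PQCLEAN_FALCONPADDED1024_CLEAN_sign_tree(sig, rng,
+		expanded_key, hm, 10, tmp.b);
+}
+#endif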
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED1024_CLEAN_gaussian0_sampler(prng *p);
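+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): a
+ * plausible sampler_context setup for degree 1024, filling sigma_min
+ * from the fpr_sigma_min[] table and taking isigma from the
+ * fpr_inv_sigma[] table (both indexed by logn). The name
+ * example_sampler_setup is hypothetical.
+ */
+#if 0
+static int
+example_sampler_setup(inner_shake256_context *src) {
+	sampler_context sc;
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&sc.p, src);
+	sc.sigma_min = fpr_sigma_min[10];
+	/* one sample centered on 0, with the per-degree standard deviation */
+	return PQCLEAN_FALCONPADDED1024_CLEAN_sampler(&sc, fpr_zero,
+		fpr_inv_sigma[10]);
+}
+#endif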
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/keygen.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/keygen.c
new file mode 100644
index 000000000..411c37463
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/keygen.c
@@ -0,0 +1,4234 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
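+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * converting to and from Montgomery representation with the helpers
+ * above, for an odd prime p with 2^30 < p < 2^31. Going to Montgomery
+ * form is a Montgomery multiplication by R2 = 2^62 mod p; coming back
+ * is a Montgomery multiplication by 1. The name example_mulmod is
+ * hypothetical.
+ */
+#if 0
+static uint32_t
+example_mulmod(uint32_t x, uint32_t y, uint32_t p) {
+	uint32_t p0i, R2, xm, ym, zm;
+
+	p0i = modp_ninv31(p);                 /* -1/p mod 2^31 */
+	R2 = modp_R2(p, p0i);                 /* 2^62 mod p */
+	xm = modp_montymul(x, R2, p, p0i);    /* to Montgomery: x*R mod p */
+	ym = modp_montymul(y, R2, p, p0i);    /* to Montgomery: y*R mod p */
+	zm = modp_montymul(xm, ym, p, p0i);   /* (x*y)*R mod p */
+	return modp_montymul(zm, 1, p, p0i);  /* back to normal: x*y mod p */
+}
+#endif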
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ *   R     2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+					modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+	 * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
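+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * multiplying two polynomials modulo X^N+1 and modulo a small prime by
+ * going through the NTT. Inputs a[] and b[] (n = 2^logn elements each,
+ * values in 0..p-1) are overwritten with their NTT; gm[] and igm[] are
+ * n-element scratch arrays. The extra montymul by R2 compensates for
+ * the 1/R factor of the pointwise Montgomery products. All names in
+ * example_ntt_polymul are hypothetical.
+ */
+#if 0
+static void
+example_ntt_polymul(uint32_t *d, uint32_t *a, uint32_t *b,
+	uint32_t *gm, uint32_t *igm, unsigned logn) {
+	uint32_t p, p0i, R2;
+	size_t u, n;
+
+	p = PRIMES[0].p;
+	p0i = modp_ninv31(p);
+	R2 = modp_R2(p, p0i);
+	n = (size_t)1 << logn;
+	modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+	modp_NTT2(a, gm, logn, p, p0i);
+	modp_NTT2(b, gm, logn, p, p0i);
+	for (u = 0; u < n; u ++) {
+		/* montymul(a,b) = a*b/R; the montymul by R2 restores a*b mod p */
+		d[u] = modp_montymul(
+			modp_montymul(a[u], b[u], p, p0i), R2, p, p0i);
+	}
+	modp_iNTT2(d, igm, logn, p, p0i);
+}
+#endif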
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
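+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): the
+ * 31-bit limb convention used by the zint_* functions below. Each
+ * uint32_t word carries 31 value bits with its top bit clear; a 64-bit
+ * value therefore spans three limbs. The name example_encode_u64 is
+ * hypothetical.
+ */
+#if 0
+static void
+example_encode_u64(uint32_t *x, uint64_t v) {
+	x[0] = (uint32_t)v & 0x7FFFFFFF;           /* bits 0..30 */
+	x[1] = (uint32_t)(v >> 31) & 0x7FFFFFFF;   /* bits 31..61 */
+	x[2] = (uint32_t)(v >> 62);                /* bits 62..63 */
+}
+#endif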
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
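+	/*
+	 * Note that modp_montymul(x, R2, p, p0i) = x*R2/2^31 = x*2^31 mod p
+	 * (since R2 = 2^62 mod p), which is how the "multiply by 2^31"
+	 * step is realized below.
+	 */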
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
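+	/*
+	 * If d is negative (bit 30 of its top word is set), then the
+	 * unsigned interpretation computed above is d + 2^(31*dlen); we
+	 * correct it by subtracting Rx = 2^(31*dlen) mod p.
+	 */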
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
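+
+/*
+ * Toy example of the Garner step above (with moduli far smaller than the
+ * 31-bit primes actually used): to rebuild x from x = 3 mod 7 and
+ * x = 5 mod 11, take q = 7 and s = 1/q mod 11 = 8; then
+ * xr = s*(5 - 3) mod 11 = 5 and x = 3 + 7*5 = 38, which indeed satisfies
+ * 38 mod 7 = 3 and 38 mod 11 = 5.
+ */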
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
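+		/*
+		 * The carries may be negative (xa, xb, ya and yb are
+		 * signed); we reinterpret the 64-bit accumulators as
+		 * signed so that the right shift propagates the sign.
+		 */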
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have a 32-th bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod y
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remaining of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+	 * - If a and b are odd, and b > a, then a similar
+	 *   argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+	 * and pb with -pb. Each such step reduces the total length
+	 * (sum of lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, thus a division by 2 really is a
+ * non-multiplication by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
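+	/*
+	 * (rc | -rc) >> 31 is 1 if rc != 0, and 0 otherwise; the final
+	 * AND with x[0] and y[0] keeps only bit 0, i.e. it also checks
+	 * that both inputs are odd.
+	 */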
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
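+	/*
+	 * Copy bit 30 into bit 31, i.e. sign-extend the 31-bit value to
+	 * 32 bits before reinterpreting it as an int32_t.
+	 */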
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to an upper word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
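+			/*
+			 * Reduction modulo X^N+1: when the target index
+			 * wraps past N-1, we go back to index 0 and negate
+			 * the multiplier, since X^N = -1.
+			 */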
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
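+
+/*
+ * Sanity check on the table above: gauss_1024_12289[0] / 2^63 is about
+ * 0.139, which matches P(x = 0) ~ 1/(sigma*sqrt(2*pi)) for
+ * sigma = 1.17*sqrt(12289/2048) ~ 2.87.
+ */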
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
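+	/*
+	 * If any partial sum reached 2^31, then the top bit of ng is set,
+	 * and -(ng >> 31) is 0xFFFFFFFF, which saturates the returned
+	 * value.
+	 */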
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure of the NTRU solving equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in fk->tmp (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+	 *    (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, reduction
+ * coefficient will remain low. If it goes out-of-range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
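+
+	/*
+	 * In short, each iteration computes
+	 *    k = round((F*adj(f) + G*adj(g)) / (f*adj(f) + g*adj(g)))
+	 * (with suitable scalings), then subtracts k*f from F and k*g
+	 * from G.
+	 */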
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffer (N fp values),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -2^31-1..+2^31-1.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+	 * "late" instead of "early"). We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
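Editorial note: the sign re-extension near the end of this function relies on the 31-bit limb convention, where bit 30 of the most significant word carries the sign, so the padding word must be 0 or 0x7FFFFFFF. A standalone sketch of the `-(x >> 30) >> 1` idiom (the helper name is ours, not part of the upstream file):

```c
#include <assert.h>
#include <stdint.h>

/* Standalone sketch (helper name is ours, not part of the upstream file):
 * derive the padding word used for sign re-extension of a big integer held
 * in 31-bit limbs, as in the loop above. Bit 30 of the top limb is the sign
 * bit, so the padding word is 0 for non-negative values and 0x7FFFFFFF for
 * negative ones. */
static uint32_t
sign_extension_word(uint32_t top_limb) {
    return (uint32_t)(-(top_limb >> 30)) >> 1;
}

int main(void) {
    assert(sign_extension_word(0x12345678) == 0);           /* bit 30 clear */
    assert(sign_extension_word(0x40000000) == 0x7FFFFFFF);  /* bit 30 set   */
    assert(sign_extension_word(0x7FFFFFFF) == 0x7FFFFFFF);  /* bit 30 set   */
    return 0;
}
```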
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the first n elements
+ * of the inverse NTT table (igm); the call to modp_mkgm()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * The integer representation of F and G is no longer needed; we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
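Editorial note: the single-pass Babai reduction above has a scalar analogue that may help when reading the FFT version: project (F,G) onto (f,g), round the ratio to an integer k, and subtract k*(f,g); the determinant f*G - g*F (the analogue of the NTRU equation) is untouched while F and G shrink. A toy sketch with illustrative plain numbers:

```c
#include <assert.h>
#include <math.h>
#include <stdio.h>

/* Toy scalar analogue of the Babai reduction above (illustrative values
 * only): k is the rounded projection of (F,G) onto (f,g); subtracting
 * k*(f,g) shrinks (F,G) while leaving f*G - g*F untouched. The real code
 * does the same thing coefficient-wise in the FFT domain. */
int main(void) {
    double f = 3.0, g = 2.0;
    double F = 10000.0, G = 10763.0;   /* chosen so that f*G - g*F = 12289 */
    double q = f * G - g * F;

    double k = round((F * f + G * g) / (f * f + g * g));
    F -= k * f;
    G -= k * g;

    assert(q == 12289.0);
    assert(f * G - g * F == q);        /* the invariant is preserved */
    printf("k = %.0f, reduced F = %.0f, G = %.0f\n", k, F, G);
    return 0;
}
```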
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
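Editorial note: the final check above verifies f*G - g*F = q in the NTT domain modulo one small prime. The same identity can be sanity-checked with schoolbook negacyclic convolution (multiplication modulo X^n + 1); the standalone sketch below uses a deliberately degenerate n = 1 example only to exercise the helper, since it has no access to a real key:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define Q 12289

/* Naive negacyclic convolution: r = a*b mod (X^n + 1), in plain integers.
 * In this sketch n must be at most 16 (see the buffers in check_ntru()). */
static void
negacyclic_mul(int64_t *r, const int64_t *a, const int64_t *b, size_t n) {
    size_t i, j;

    memset(r, 0, n * sizeof *r);
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            if (i + j < n) {
                r[i + j] += a[i] * b[j];
            } else {
                r[i + j - n] -= a[i] * b[j];  /* X^n = -1 */
            }
        }
    }
}

/* Returns 1 if f*G - g*F equals the constant polynomial q mod (X^n + 1). */
static int
check_ntru(const int64_t *f, const int64_t *g,
           const int64_t *F, const int64_t *G, size_t n) {
    int64_t t1[16], t2[16];
    size_t u;

    negacyclic_mul(t1, f, G, n);
    negacyclic_mul(t2, g, F, n);
    for (u = 0; u < n; u++) {
        if (t1[u] - t2[u] != (u == 0 ? Q : 0)) {
            return 0;
        }
    }
    return 1;
}

int main(void) {
    /* Degenerate n = 1 case, only to exercise the helper: f = 1, g = 0,
     * F = 0, G = q trivially satisfies f*G - g*F = q. */
    int64_t f[1] = {1}, g[1] = {0}, F[1] = {0}, G[1] = {Q};
    assert(check_ntru(f, g, F, G, 1));
    return 0;
}
```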
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd (1 modulo 2);
+ * otherwise, the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
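Editorial note on why the parity trick works: modulo 2, phi = X^N + 1 factors as (X + 1)^N, so the resultant collapses to an evaluation at 1. A short derivation (standard algebra, stated here rather than taken from the Falcon sources):

```latex
\operatorname{Res}\bigl(f,\,X^N+1\bigr)
  \;\equiv\; \operatorname{Res}\bigl(f,\,(X+1)^N\bigr)
  \;\equiv\; f(1)^N \pmod{2}
```

Since f(1) is the sum of the coefficients, the resultant is odd exactly when that sum is odd, which is the condition the loop above enforces.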
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_CLEAN_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/pqclean.c
new file mode 100644
index 000000000..eb6cc85a1
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits per element, depending on degree)
+ * private g (6 or 5 bits per element, depending on degree)
+ * private F (8 bits per element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits per element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 1280 bytes
+ *
+ * message + signature:
+ * signature 1280 bytes
+ * message
+ */
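Editorial note: for logn = 10 the per-element widths above pin down the encoded sizes. The arithmetic below is a standalone sketch; the totals it checks (2305, 1793 and 1280 bytes) are what the CRYPTO_* constants in api.h are expected to be, stated here as an assumption rather than quoted from that file.

```c
#include <assert.h>
#include <stddef.h>

/* Standalone size arithmetic for the logn = 10 (Falcon-padded-1024) case of
 * the format described above; f and g use 5 bits per element at this degree
 * and F uses 8. The expected totals are assumptions about what api.h
 * defines, written out here only to show where they come from. */
int main(void) {
    size_t n = 1024;

    size_t sk_len = 1             /* header byte 0101nnnn   */
                    + n * 5 / 8   /* f: 5 bits per element  */
                    + n * 5 / 8   /* g: 5 bits per element  */
                    + n * 8 / 8;  /* F: 8 bits per element  */
    size_t pk_len = 1             /* header byte 0000nnnn   */
                    + n * 14 / 8; /* h: 14 bits per element */
    size_t sig_len = 1            /* header byte 0011nnnn   */
                     + 40         /* nonce                  */
                     + 1239;      /* compressed s, padded   */

    assert(sk_len == 2305);
    assert(pk_len == 1793);
    assert(sig_len == 1280);
    return 0;
}
```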
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_10];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024];
+ uint16_t h[1024];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 10;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ f, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ g, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ F, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 10;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 10);
+ if (v != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024], G[1024];
+ struct {
+ int16_t sig[1024];
+ uint16_t hm[1024];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 10) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ f, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ g, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ F, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_CLEAN_complete_private(G, f, g, F, 10, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(sigbuf, sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[1024], hm[1024];
+ int16_t sig[1024];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 10) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED1024_CLEAN_modq_decode(h, 10,
+ pk + 1, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_to_ntt_monty(h, 10);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(&sc, hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw(hm, sig, h, 10, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 10;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 10) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 10;
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 10) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
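Editorial note: taken together, the functions above implement the standard PQClean signature API. A hypothetical round-trip through the detached-signature pair might look as follows (sketch only; it assumes a randombytes() implementation is linked in, as PQClean requires, and reuses the constants from api.h already referenced above):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "api.h"

/* Hypothetical round-trip through the detached-signature API above: keypair,
 * sign, verify. Sketch only; error handling is reduced to asserts. */
int main(void) {
    uint8_t pk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES];
    uint8_t sk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES];
    uint8_t sig[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES];
    uint8_t msg[] = "example message";
    size_t siglen = 0;

    assert(PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(pk, sk) == 0);
    assert(PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(
               sig, &siglen, msg, sizeof msg, sk) == 0);
    assert(siglen == sizeof sig);  /* padded variant: fixed-length signatures */
    assert(PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(
               sig, siglen, msg, sizeof msg, pk) == 0);
    return 0;
}
```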
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/rng.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/rng.c
new file mode 100644
index 000000000..169d35fb2
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/rng.c
@@ -0,0 +1,188 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <assert.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ uint32_t *d32 = (uint32_t *) p->state.d;
+ uint64_t *d64 = (uint64_t *) p->state.d;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ d32[i] = w;
+ }
+ tl = d32[48 / sizeof(uint32_t)];
+ th = d32[52 / sizeof(uint32_t)];
+ d64[48 / sizeof(uint64_t)] = tl + (th << 32);
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * The state consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the last 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
+ */
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
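Editorial note: the interleaving loop above writes byte b of word v of block u at offset (u << 2) + (v << 5) + b; since b + 4*u ranges over 0..31 within each 32-byte stripe selected by v, these offsets form a permutation of 0..511 and exactly fill the 512-byte output buffer (8 blocks of 64 bytes). A small standalone check of that indexing:

```c
#include <assert.h>
#include <string.h>

/* Check that the AVX2-style interleaving used above -- 8 blocks of 16 words
 * of 4 bytes, byte b of word v of block u stored at (u << 2) + (v << 5) + b
 * -- hits every offset of the 512-byte output buffer exactly once. */
int main(void) {
    unsigned char seen[512];
    unsigned u, v, b;

    memset(seen, 0, sizeof seen);
    for (u = 0; u < 8; u++) {             /* ChaCha20 block    */
        for (v = 0; v < 16; v++) {        /* state word        */
            for (b = 0; b < 4; b++) {     /* byte within word  */
                unsigned off = (u << 2) + (v << 5) + b;
                assert(off < 512 && seen[off] == 0);
                seen[off] = 1;
            }
        }
    }
    return 0;
}
```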
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/sign.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/sign.c
new file mode 100644
index 000000000..a7dbbfc62
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/sign.c
@@ -0,0 +1,1248 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
+ return (logn + 1) << logn;
+}
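Editorial note: the closed form returned above, (logn + 1)*2^logn, does solve the recurrence stated in the comment (s(0) = 1, s(logn) = 2^logn + 2*s(logn-1)). A quick standalone check of the first few levels:

```c
#include <assert.h>

/* Check that the closed form (logn + 1) << logn returned above satisfies
 * the recurrence from the comment: s(0) = 1 and
 * s(logn) = 2^logn + 2*s(logn - 1). */
int main(void) {
    unsigned s_prev = 1;  /* s(0) */
    unsigned logn;

    assert(((0 + 1) << 0) == 1);
    for (logn = 1; logn <= 10; logn++) {
        unsigned s = (1u << logn) + 2 * s_prev;   /* recurrence  */
        assert(s == ((logn + 1) << logn));        /* closed form */
        s_prev = s;
    }
    return 0;
}
```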
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regard to the modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
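+
+ /*
+ * t0, t1, tx and ty account for four polynomials; ffSampling_fft()
+ * receives two more at ty + n, which is how the six-polynomial
+ * requirement on tmp[] is met.
+ */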
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
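+
+ /*
+ * ng has its top bit set if the running sum ever reached 2^31; in
+ * that case sqn is saturated to 2^32 - 1, so that the short-vector
+ * test below rejects the signature.
+ */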
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regard to the modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not preserve the basis matrix, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ size_t u;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+ z = 0;
+ for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) {
+ uint32_t w0, w1, w2, cc;
+
+ w0 = dist[u + 2];
+ w1 = dist[u + 1];
+ w2 = dist[u + 0];
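+ /*
+ * The next three lines subtract this 72-bit table entry from
+ * v0..v2, one 24-bit limb at a time; cc propagates the borrow and
+ * ends up as 1 exactly when v0..v2 is lower than the entry.
+ */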
+ cc = (v0 - w0) >> 31;
+ cc = (v1 - w1 - cc) >> 31;
+ cc = (v2 - w2 - cc) >> 31;
+ z += (int)cc;
+ }
+ return z;
+
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
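+ /*
+ * Branchless saturation: when sw > 63, the subtraction 63 - sw
+ * wraps around, its top bit turns the mask on, and sw is replaced
+ * with 63; otherwise sw is left unchanged.
+ */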
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r). We compare exp(-x) lazily with the
+ * PRNG output, to limit the number of random bytes consumed; the
+ * sign of the difference yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED1024_CLEAN_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
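+ /* ((b << 1) - 1) is +1 or -1: z is z0 + 1 when b = 1, -z0 when b = 0 */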
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/vrfy.c
new file mode 100644
index 000000000..58dbf0bec
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
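+
+/*
+ * As a numerical check: 65536 mod 12289 = 4091 (R), 4091^2 mod 12289
+ * = 10952 (R2), and 12289 * 12287 = 2304 * 2^16 - 1, so Q0I is indeed
+ * -1/q mod 2^16.
+ */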
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_rshift1(uint32_t x) {
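+ /*
+ * If x is odd, adding q (which is odd) makes it even; the shift
+ * then yields the value y such that 2*y = x mod q.
+ */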
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
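+ /*
+ * For instance, mq_montymul(1, 1) = 2304, which is indeed 1/R mod q
+ * since 2304 * 2^16 = 12289 * 12287 + 1.
+ */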
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x and y are lower than q, z is less than q^2
+ * and w is at most (2^16 - 1)*q, so the sum is less than 2^30 and
+ * fits easily on 32 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We do a subtraction, then a conditional addition of q, to
+ * ensure the result is in the expected range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
+ return mq_montymul(y18, x);
+}
+
+/*
+ * Compute NTT on a ring element.
+ */
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
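+ /*
+ * Butterfly: a[j], a[j + ht] <- a[j] + s*a[j + ht], a[j] - s*a[j + ht].
+ * Since GMb[] entries are already scaled by R, the Montgomery
+ * multiplication yields the plain product by the twiddle factor.
+ */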
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+ * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
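+ /* subtract q when w > q/2; the mask comes from the sign bit */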
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED1024_CLEAN_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
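+ /* tt[u] - 1 has its top bit set exactly when tt[u] is zero */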
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information in a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED1024_CLEAN_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/LICENSE b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/LICENSE
new file mode 100644
index 000000000..4df2d7836
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/LICENSE
@@ -0,0 +1,57 @@
+This ARMv8 NEON implementation is provided under the Apache 2.0 license:
+
+/*
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+Based on the reference code provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/api.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/api.h
new file mode 100644
index 000000000..deba20b36
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED512_AARCH64_API_H
+#define PQCLEAN_FALCONPADDED512_AARCH64_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES 1281
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES 666
+
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_ALGNAME "Falcon-padded-512"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/codec.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/codec.c
new file mode 100644
index 000000000..3fe3a9452
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/codec.c
@@ -0,0 +1,554 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
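+ /* each coefficient is encoded over 14 bits, since q = 12289 < 2^14 */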
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= FALCON_Q) {
+ return 0;
+ }
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_modq_decode(uint16_t *x, const void *in, size_t max_in_len, unsigned logn) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, uint8_t bits) {
+ size_t u, out_len;
+ int8_t minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ out_len = (size_t) ((FALCON_N * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ maxv = (int8_t) (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int8(x, minv, maxv)) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < FALCON_N; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(int8_t *x, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ in_len = ((FALCON_N * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < FALCON_N) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < FALCON_N) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x) {
+ uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int16(x, -2047, 2047)) {
+ return 0;
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * will fit in the accumulator, which is a uint32_t.
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
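+
+/*
+ * Worked example of the compressed format above (informative): the value
+ * -137 is emitted as a sign bit of 1, the low 7 bits of |x| = 137
+ * (0001001), then |x| >> 7 = 1 zero followed by a terminating 1, for
+ * 10 bits in total; any coefficient with |x| < 128 costs exactly 9 bits.
+ */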
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+
+ x[u] = (int16_t)(s ? -(int)m : (int)m);
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for the private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
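+
+/*
+ * Worked example (informative): for logn = 9 (n = 512, the parameter set
+ * used here), max_fg_bits[9] = 6 and max_FG_bits[9] = 8, so f and g each
+ * encode into (512 * 6 + 7) / 8 = 384 bytes while F and G each take
+ * 512 bytes under trim_i8_encode().
+ */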
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/common.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/common.c
new file mode 100644
index 000000000..b461baa8c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/common.c
@@ -0,0 +1,549 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "macrous.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 5 * FALCON_Q) {
+ while (w >= FALCON_Q) {
+ w -= FALCON_Q;
+ }
+ *x++ = (uint16_t)w;
+ n--;
+ }
+ }
+}
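+
+/*
+ * Informative example of the rejection step above: with FALCON_Q = 12289,
+ * a 16-bit sample w = 61444 is kept (61444 < 5 * 12289 = 61445) and
+ * reduced to 61444 - 4 * 12289 = 12288, while w = 61445 is rejected and
+ * two fresh bytes are drawn.
+ */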
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce modulo q the values; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
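+
+/*
+ * Informative example for logn = 9 (n = 512): over = 205, so m = 717
+ * samples are drawn; indices 0..511 land in x[], indices 512..716 land
+ * in tt1[] (i.e. tmp), and tt2[] stays unused because m <= 2*n. The
+ * squeeze-out passes then run for p = 1, 2, 4, ..., 128.
+ */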
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h
+ * NEON provides the signed saturating doubling multiply-accumulate
+ * instructions sqdmlal/sqdmlal2, so we keep 2 parallel dependency chains
+ * rather than 1 for better scheduling.
+ * Each for loop is tuned for cache locality.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short(const int16_t *s1, const int16_t *s2) {
+ // Total SIMD register 18 = 16 + 2
+ int16x8x4_t neon_s1, neon_s2, neon_s3, neon_s4; // 16
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s1[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s1[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s1[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s1[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s2[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s2[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s2[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s2[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
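+
+/*
+ * Note (informative): vqdmlal accumulates 2*a*b, so neon_s and neon_sh
+ * each hold twice their partial sums of squares; the halving add
+ * vhaddq_s32 above cancels that factor of 2 before the lanes are folded
+ * with saturating adds and compared against l2bound[FALCON_LOGN].
+ */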
+
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const fpr *t0,
+ const fpr *t1) {
+ // Total SIMD registers: 26 = 16 + 8 + 2
+ int16x8x4_t neon_hm, neon_ts; // 8
+ float64x2x4_t neon_tf0, neon_tf1, neon_tf2, neon_tf3; // 16
+ int64x2x4_t neon_ts0, neon_ts1, neon_ts2, neon_ts3; // 16
+ int32x4x4_t neon_ts4, neon_ts5; // 8
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ // s1tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t0[i]);
+ vloadx4(neon_tf1, &t0[i + 8]);
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t0[i + 16]);
+ vloadx4(neon_tf3, &t0[i + 24]);
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ // hm = hm - fpr_rint(t0)
+ vload_s16_x4(neon_hm, &hm[i]);
+ neon_hm.val[0] = vsubq_s16(neon_hm.val[0], neon_ts.val[0]);
+ neon_hm.val[1] = vsubq_s16(neon_hm.val[1], neon_ts.val[1]);
+ neon_hm.val[2] = vsubq_s16(neon_hm.val[2], neon_ts.val[2]);
+ neon_hm.val[3] = vsubq_s16(neon_hm.val[3], neon_ts.val[3]);
+ vstore_s16_x4(&s1tmp[i], neon_hm);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[0]), vget_low_s16(neon_hm.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[1]), vget_low_s16(neon_hm.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[2]), vget_low_s16(neon_hm.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[3]), vget_low_s16(neon_hm.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[0], neon_hm.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[1], neon_hm.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[2], neon_hm.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[3], neon_hm.val[3]);
+ }
+
+ // s2tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t1[i]);
+ vloadx4(neon_tf1, &t1[i + 8]);
+
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t1[i + 16]);
+ vloadx4(neon_tf3, &t1[i + 24]);
+
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ neon_ts.val[0] = vnegq_s16(neon_ts.val[0]);
+ neon_ts.val[1] = vnegq_s16(neon_ts.val[1]);
+ neon_ts.val[2] = vnegq_s16(neon_ts.val[2]);
+ neon_ts.val[3] = vnegq_s16(neon_ts.val[3]);
+ vstore_s16_x4(&s2tmp[i], neon_ts);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[0]), vget_low_s16(neon_ts.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[1]), vget_low_s16(neon_ts.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[2]), vget_low_s16(neon_ts.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[3]), vget_low_s16(neon_ts.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[0], neon_ts.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[1], neon_ts.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[2], neon_ts.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[3], neon_ts.val[3]);
+ }
+
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
+
+int32_t PQCLEAN_FALCONPADDED512_AARCH64_poly_small_sqnorm(const int8_t *f) {
+ int8x16x4_t a;
+ int16x8x4_t b, c;
+ int32x4_t norm, norm_sh;
+
+ norm = vdupq_n_s32(0);
+ norm_sh = vdupq_n_s32(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&f[i]);
+
+ b.val[0] = vmovl_s8(vget_low_s8(a.val[0]));
+ b.val[1] = vmovl_high_s8(a.val[0]);
+ b.val[2] = vmovl_s8(vget_low_s8(a.val[1]));
+ b.val[3] = vmovl_high_s8(a.val[1]);
+
+ c.val[0] = vmovl_s8(vget_low_s8(a.val[2]));
+ c.val[1] = vmovl_high_s8(a.val[2]);
+ c.val[2] = vmovl_s8(vget_low_s8(a.val[3]));
+ c.val[3] = vmovl_high_s8(a.val[3]);
+
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[0]), vget_low_s16(b.val[0]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[1]), vget_low_s16(b.val[1]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[2]), vget_low_s16(b.val[2]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[3]), vget_low_s16(b.val[3]));
+
+ norm = vqdmlal_high_s16(norm, b.val[0], b.val[0]);
+ norm = vqdmlal_high_s16(norm, b.val[1], b.val[1]);
+ norm = vqdmlal_high_s16(norm, b.val[2], b.val[2]);
+ norm = vqdmlal_high_s16(norm, b.val[3], b.val[3]);
+
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[0]), vget_low_s16(c.val[0]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[1]), vget_low_s16(c.val[1]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[2]), vget_low_s16(c.val[2]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[3]), vget_low_s16(c.val[3]));
+
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[0], c.val[0]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[1], c.val[1]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[2], c.val[2]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[3], c.val[3]);
+ }
+ // 32x4
+ norm = vhaddq_s32(norm, norm_sh);
+ // 32x4 -> 32x2
+ int32x2_t tmp;
+ tmp = vqadd_s32(vget_low_s32(norm), vget_high_s32(norm));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ int32_t s;
+ s = vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft.c
new file mode 100644
index 000000000..9de1bc33e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft.c
@@ -0,0 +1,1038 @@
+/*
+ * High-speed vectorized FFT code for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Forward FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, v_re, v_im, t_re, t_im, s;
+
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0];
+
+ t_re = y_re * s;
+ t_im = y_im * s;
+
+ v_re = t_re - t_im;
+ v_im = t_re + t_im;
+
+ f[0] = x_re + v_re;
+ f[1] = x_re - v_re;
+ f[2] = x_im + v_im;
+ f[3] = x_im - v_im;
+}
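+
+/*
+ * Informative note on the butterfly above: the twiddle factor is
+ * s + i*s with s = fpr_tab_log2[0] (its real and imaginary parts are
+ * equal at this level), so y * twiddle has real part s*(y_re - y_im)
+ * and imaginary part s*(y_re + y_im), which is exactly v_re and v_im.
+ */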
+
+/*
+ * Vectorized 2 layers of Forward FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log3(fpr *f) {
+ // Total SIMD registers: 18 = 4 + 6 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t s_re_im, x, y; // 6
+ float64x2_t v_re, v_im, x_re, x_im, y_re, y_im, t_x, t_y; // 8
+
+ vloadx4(tmp, &f[0]);
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmul(v_re, tmp.val[1], s_re_im.val[0]);
+ vfmul(v_im, tmp.val[3], s_re_im.val[0]);
+
+ vfsub(t_x, v_re, v_im);
+ vfadd(t_y, v_re, v_im);
+
+ vfsub(tmp.val[1], tmp.val[0], t_x);
+ vfsub(tmp.val[3], tmp.val[2], t_y);
+
+ vfadd(tmp.val[0], tmp.val[0], t_x);
+ vfadd(tmp.val[2], tmp.val[2], t_y);
+
+ x_re = vtrn1q_f64(tmp.val[0], tmp.val[1]);
+ y_re = vtrn2q_f64(tmp.val[0], tmp.val[1]);
+ x_im = vtrn1q_f64(tmp.val[2], tmp.val[3]);
+ y_im = vtrn2q_f64(tmp.val[2], tmp.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(v_re, v_im, y_re, y_im, s_re_im.val[0], s_re_im.val[1]);
+
+ FPC_ADD(x.val[0], y.val[0], x_re, x_im, v_re, v_im);
+ FPC_SUB(x.val[1], y.val[1], x_re, x_im, v_re, v_im);
+
+ vstore2(&f[0], x);
+ vstore2(&f[4], y);
+}
+
+/*
+ * Vectorized 3 layers of Forward FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log4(fpr *f) {
+ // Total SIMD register: 26 = 8 + 18
+ float64x2x4_t t0, t1; // 8
+ float64x2x2_t x_re, x_im, y_re, y_im, v1, v2, tx, ty, s_re_im; // 18
+
+ vloadx4(t0, &f[0]);
+ vloadx4(t1, &f[8]);
+ vload(s_re_im.val[0], &fpr_tab_log2[0]);
+
+ vfmul(v1.val[0], t0.val[2], s_re_im.val[0]);
+ vfmul(v1.val[1], t0.val[3], s_re_im.val[0]);
+
+ vfmul(v2.val[0], t1.val[2], s_re_im.val[0]);
+ vfmul(v2.val[1], t1.val[3], s_re_im.val[0]);
+
+ vfsub(tx.val[0], v1.val[0], v2.val[0]);
+ vfsub(tx.val[1], v1.val[1], v2.val[1]);
+
+ vfadd(ty.val[0], v1.val[0], v2.val[0]);
+ vfadd(ty.val[1], v1.val[1], v2.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[2], t1.val[2], tx.val[0], ty.val[0]);
+ FWD_BOT(t0.val[1], t1.val[1], t0.val[3], t1.val[3], tx.val[1], ty.val[1]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ FWD_TOP_LANE(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ x_re.val[0] = t0.val[0];
+ x_re.val[1] = t0.val[2];
+ y_re.val[0] = t0.val[1];
+ y_re.val[1] = t0.val[3];
+
+ x_im.val[0] = t1.val[0];
+ x_im.val[1] = t1.val[2];
+ y_im.val[0] = t1.val[1];
+ y_im.val[1] = t1.val[3];
+
+ t0.val[0] = vzip1q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[1] = vzip2q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[2] = vzip1q_f64(y_re.val[0], y_re.val[1]);
+ t0.val[3] = vzip2q_f64(y_re.val[0], y_re.val[1]);
+
+ t1.val[0] = vzip1q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[1] = vzip2q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[2] = vzip1q_f64(y_im.val[0], y_im.val[1]);
+ t1.val[3] = vzip2q_f64(y_im.val[0], y_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ FWD_TOP(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ vstore4(&f[0], t0);
+ vstore4(&f[8], t1);
+}
+
+/*
+ * Vectorized 4 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(fpr *f, const unsigned logn) {
+ // Total SIMD register: 34 = 2 + 32
+ float64x2x2_t s_re_im; // 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im, v_re, v_im; // 32
+
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ unsigned int level = logn - 3;
+ const fpr *fpr_tab2 = fpr_table[level++],
+ *fpr_tab3 = fpr_table[level++],
+ *fpr_tab4 = fpr_table[level++],
+ *fpr_tab5 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+
+ /*
+ * We only increase k2 when j has the form j = 32*x + 16.
+ * Taking both sides modulo 32, this amounts to checking (j % 32) == 16.
+ */
+ k2 += 2 * ((j & 31) == 16);
+
+ vloadx4(y_re, &f[j + 8]);
+ vloadx4(y_im, &f[j + 8 + hn]);
+
+ if (logn == 5) {
+ // Handle the special case of fpr_tab_log2, where re == im.
+ // This reduces the number of multiplications, although it uses
+ // the same number of instructions as the "else" branch.
+ vfmulx4_i(t_im, y_im, s_re_im.val[0]);
+ vfmulx4_i(t_re, y_re, s_re_im.val[0]);
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+ } else {
+ FWD_TOP_LANEx4(v_re, v_im, y_re, y_im, s_re_im.val[0]);
+ }
+
+ vloadx4(x_re, &f[j]);
+ vloadx4(x_im, &f[j + hn]);
+
+ if ((j >> 4) & 1) {
+ FWD_BOTJx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ } else {
+ FWD_BOTx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[2], x_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[2], y_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[1]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ transpose_f64(x_re, x_re, v_re, 0, 2, 0);
+ transpose_f64(x_re, x_re, v_re, 1, 3, 1);
+ transpose_f64(x_im, x_im, v_im, 0, 2, 0);
+ transpose_f64(x_im, x_im, v_im, 1, 3, 1);
+
+ v_re.val[0] = x_re.val[2];
+ x_re.val[2] = x_re.val[1];
+ x_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = x_im.val[2];
+ x_im.val[2] = x_im.val[1];
+ x_im.val[1] = v_im.val[0];
+
+ transpose_f64(y_re, y_re, v_re, 0, 2, 2);
+ transpose_f64(y_re, y_re, v_re, 1, 3, 3);
+ transpose_f64(y_im, y_im, v_im, 0, 2, 2);
+ transpose_f64(y_im, y_im, v_im, 1, 3, 3);
+
+ v_re.val[0] = y_re.val[2];
+ y_re.val[2] = y_re.val[1];
+ y_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = y_im.val[2];
+ y_im.val[2] = y_im.val[1];
+ y_im.val[1] = v_im.val[0];
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j], x_re);
+ vstore4(&f[j + hn], x_im);
+
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vstore4(&f[j + 8], y_re);
+ vstore4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ // Total SIMD register: 25 = 1 + 24
+ float64x2_t s_re_im; // 1
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im, v_re, v_im; // 24
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(b_re, &f[j + ht]);
+ vfmulx4_i(t_re, b_re, s_re_im);
+
+ vloadx4(b_im, &f[j + ht + hn]);
+ vfmulx4_i(t_im, b_im, s_re_im);
+
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+
+ FWD_BOTx4(a_re, a_im, b_re, b_im, v_re, v_im);
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j], a_re);
+
+ vstorex4(&f[j + ht + hn], b_im);
+ vstorex4(&f[j + hn], a_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(fpr *f, const unsigned logn, const unsigned level) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 8 + 16 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_tab1 = NULL, *fpr_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - level + 2;
+
+ for (l = level - 1; l > 4; l -= 2) {
+ len = 1 << (l - 2);
+ fpr_tab1 = fpr_table[bar++];
+ fpr_tab2 = fpr_table[bar++];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << l) {
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOT(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOT(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+
+ start += 1U << l;
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOTJ(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+ }
+ }
+}
+
+/*
+ * 1 layer of Inverse FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, s;
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f[0] = (x_re + y_re) * 0.5;
+ f[2] = (x_im + y_im) * 0.5;
+
+ x_re = (x_re - y_re) * s;
+ x_im = (x_im - y_im) * s;
+
+ f[1] = x_im + x_re;
+ f[3] = x_im - x_re;
+}
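+
+/*
+ * Informative note: the 0.5 factors apply the inverse-FFT normalization
+ * for this single layer, and s = fpr_tab_log2[0] * 0.5 folds the
+ * conjugate twiddle together with that same halving.
+ */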
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log3(fpr *f) {
+ // Total SIMD registers: 12 = 4 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t x_re_im, y_re_im, v, s_re_im; // 8
+
+ vload2(x_re_im, &f[0]);
+ vload2(y_re_im, &f[4]);
+
+ vfsub(v.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfsub(v.val[1], y_re_im.val[0], y_re_im.val[1]);
+ vfadd(x_re_im.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfadd(x_re_im.val[1], y_re_im.val[0], y_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmul(y_re_im.val[0], v.val[1], s_re_im.val[1]);
+ vfmla(y_re_im.val[0], y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+ vfmls(y_re_im.val[1], y_re_im.val[1], v.val[0], s_re_im.val[1]);
+
+ tmp.val[0] = vtrn1q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[1] = vtrn2q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[2] = vtrn1q_f64(x_re_im.val[1], y_re_im.val[1]);
+ tmp.val[3] = vtrn2q_f64(x_re_im.val[1], y_re_im.val[1]);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfadd(x_re_im.val[0], tmp.val[0], tmp.val[1]);
+ vfadd(x_re_im.val[1], tmp.val[2], tmp.val[3]);
+ vfsub(v.val[0], tmp.val[0], tmp.val[1]);
+ vfsub(v.val[1], tmp.val[2], tmp.val[3]);
+
+ vfmuln(tmp.val[0], x_re_im.val[0], 0.25);
+ vfmuln(tmp.val[2], x_re_im.val[1], 0.25);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.25);
+
+ vfmul(y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+
+ vfadd(tmp.val[1], y_re_im.val[1], y_re_im.val[0]);
+ vfsub(tmp.val[3], y_re_im.val[1], y_re_im.val[0]);
+
+ vstorex4(&f[0], tmp);
+}
+
+/*
+ * Vectorized 3 layers of Inverse FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log4(fpr *f) {
+ // Total SIMD registers: 18 = 12 + 6
+ float64x2x4_t re, im, t; // 12
+ float64x2x2_t t_re, t_im, s_re_im; // 6
+
+ vload4(re, &f[0]);
+ vload4(im, &f[8]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ INV_BOTJ(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ // re: 0, 4 | 1, 5 | 2, 6 | 3, 7
+ // im: 8, 12| 9, 13|10, 14|11, 15
+ transpose_f64(re, re, t, 0, 1, 0);
+ transpose_f64(re, re, t, 2, 3, 1);
+ transpose_f64(im, im, t, 0, 1, 2);
+ transpose_f64(im, im, t, 2, 3, 3);
+
+ // re: 0, 1 | 4, 5 | 2, 3 | 6, 7
+ // im: 8, 9 | 12, 13|10, 11| 14, 15
+ t.val[0] = re.val[1];
+ re.val[1] = re.val[2];
+ re.val[2] = t.val[0];
+
+ t.val[1] = im.val[1];
+ im.val[1] = im.val[2];
+ im.val[2] = t.val[1];
+
+ // re: 0, 1 | 2, 3| 4, 5 | 6, 7
+ // im: 8, 9 | 10, 11| 12, 13| 14, 15
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ INV_BOTJ_LANE(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[2], im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], re.val[1], im.val[1], re.val[3], im.val[3]);
+
+ vfmuln(re.val[0], re.val[0], 0.12500000000);
+ vfmuln(re.val[1], re.val[1], 0.12500000000);
+ vfmuln(im.val[0], im.val[0], 0.12500000000);
+ vfmuln(im.val[1], im.val[1], 0.12500000000);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.12500000000);
+
+ vfmul(t_re.val[0], t_re.val[0], s_re_im.val[0]);
+ vfmul(t_re.val[1], t_re.val[1], s_re_im.val[0]);
+ vfmul(t_im.val[0], t_im.val[0], s_re_im.val[0]);
+ vfmul(t_im.val[1], t_im.val[1], s_re_im.val[0]);
+
+ vfsub(im.val[2], t_im.val[0], t_re.val[0]);
+ vfsub(im.val[3], t_im.val[1], t_re.val[1]);
+ vfadd(re.val[2], t_im.val[0], t_re.val[0]);
+ vfadd(re.val[3], t_im.val[1], t_re.val[1]);
+
+ vstorex4(&f[0], re);
+ vstorex4(&f[8], im);
+}
+
+/*
+ * Vectorized 4 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register: 26 = 24 + 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im; // 24
+ float64x2x2_t s_re_im; // 2
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+
+ unsigned int level = logn;
+ const fpr *fpr_tab5 = fpr_table[level--],
+ *fpr_tab4 = fpr_table[level--],
+ *fpr_tab3 = fpr_table[level--],
+ *fpr_tab2 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+
+ vload4(x_re, &f[j]);
+ vload4(x_im, &f[j + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[2], t_im.val[2], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ vload4(y_re, &f[j + 8]);
+ vload4(y_im, &f[j + 8 + hn]);
+
+ INV_TOPJ(t_re.val[1], t_im.val[1], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(x_re.val[3], x_im.val[3], t_re.val[2], t_im.val[2], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(y_re.val[1], y_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ transpose_f64(x_re, x_re, t_re, 0, 1, 0);
+ transpose_f64(x_re, x_re, t_re, 2, 3, 1);
+ transpose_f64(y_re, y_re, t_re, 0, 1, 2);
+ transpose_f64(y_re, y_re, t_re, 2, 3, 3);
+
+ transpose_f64(x_im, x_im, t_im, 0, 1, 0);
+ transpose_f64(x_im, x_im, t_im, 2, 3, 1);
+ transpose_f64(y_im, y_im, t_im, 0, 1, 2);
+ transpose_f64(y_im, y_im, t_im, 2, 3, 3);
+
+ t_re.val[0] = x_re.val[1];
+ x_re.val[1] = x_re.val[2];
+ x_re.val[2] = t_re.val[0];
+
+ t_re.val[1] = y_re.val[1];
+ y_re.val[1] = y_re.val[2];
+ y_re.val[2] = t_re.val[1];
+
+ t_im.val[0] = x_im.val[1];
+ x_im.val[1] = x_im.val[2];
+ x_im.val[2] = t_im.val[0];
+
+ t_im.val[1] = y_im.val[1];
+ y_im.val[1] = y_im.val[2];
+ y_im.val[2] = t_im.val[1];
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ INV_BOTJ_LANE(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJ_LANE(y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2], s_re_im.val[1]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[1]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ INV_BOTJ_LANE(x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJ_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJm_LANE(y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2], s_re_im.val[0]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0]);
+
+ if ((j >> 4) & 1) {
+ INV_TOPJmx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ } else {
+ INV_TOPJx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+ k2 += 2 * ((j & 31) == 16);
+
+ if (last) {
+ vfmuln(s_re_im.val[0], s_re_im.val[0], fpr_p2_tab[logn]);
+ vfmulnx4(x_re, x_re, fpr_p2_tab[logn]);
+ vfmulnx4(x_im, x_im, fpr_p2_tab[logn]);
+ }
+ vstorex4(&f[j], x_re);
+ vstorex4(&f[j + hn], x_im);
+
+ if (logn == 5) {
+ // Special case in fpr_tab_log2 where re == im
+ vfmulx4_i(t_re, t_re, s_re_im.val[0]);
+ vfmulx4_i(t_im, t_im, s_re_im.val[0]);
+
+ vfaddx4(y_re, t_im, t_re);
+ vfsubx4(y_im, t_im, t_re);
+ } else {
+ if ((j >> 4) & 1) {
+ INV_BOTJm_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ } else {
+ INV_BOTJ_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ }
+ }
+
+ vstorex4(&f[j + 8], y_re);
+ vstorex4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register 26 = 24 + 2
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im; // 24
+ float64x2_t s_re_im; // 2
+
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+ vloadx4(b_re, &f[j + ht]);
+ vloadx4(b_im, &f[j + ht + hn]);
+
+ INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im);
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ if (last) {
+ vfmuln(s_re_im, s_re_im, fpr_p2_tab[logn]);
+ vfmulnx4(a_re, a_re, fpr_p2_tab[logn]);
+ vfmulnx4(a_im, a_im, fpr_p2_tab[logn]);
+ }
+
+ vstorex4(&f[j], a_re);
+ vstorex4(&f[j + hn], a_im);
+
+ vfmulx4_i(t_re, t_re, s_re_im);
+ vfmulx4_i(t_im, t_im, s_re_im);
+
+ vfaddx4(b_re, t_im, t_re);
+ vfsubx4(b_im, t_im, t_re);
+
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j + ht + hn], b_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(fpr *f, const unsigned logn, const unsigned level, unsigned last) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 16 + 8 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_inv_tab1 = NULL, *fpr_inv_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - 4;
+
+ for (l = 4; l < logn - level - 1; l += 2) {
+ len = 1 << l;
+ last -= 1;
+ fpr_inv_tab1 = fpr_table[bar--];
+ fpr_inv_tab2 = fpr_table[bar--];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << (l + 2)) {
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJ(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJ_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+
+ start += 1U << (l + 2);
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJm(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJm_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJm_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+ }
+ }
+}
+
+/*
+ * Scalable vectorized Forward FFT implementation.
+ * Supports logn in [1, 10].
+ * Can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_FFT(fpr *f, const unsigned logn) {
+ unsigned level = logn;
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, 5);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(f, logn, level);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(f, logn, level - 1);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Scalable vectorized Inverse FFT implementation.
+ * Supports logn in [1, 10].
+ * Can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_iFFT(fpr *f, const unsigned logn) {
+ const unsigned level = (logn - 5) & 1;
+
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, 5, 1);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(f, logn, level, 1);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(f, logn, level, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ default:
+ break;
+ }
+}
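A minimal round-trip sketch (illustrative, not part of the patch): for falcon-padded-512 the polynomial degree is n = 512, so logn = 9; running the forward and inverse dispatchers back to back should reproduce the input, since the inverse path applies the 2^(1-logn) scaling on its final stage.

void fft_roundtrip_demo(void) {
    fpr f[512], g[512];
    for (int i = 0; i < 512; i++) {
        f[i] = g[i] = (fpr)(i % 17) - 8.0;   /* arbitrary test data */
    }
    PQCLEAN_FALCONPADDED512_AARCH64_FFT(f, 9);
    PQCLEAN_FALCONPADDED512_AARCH64_iFFT(f, 9);
    /* f[i] now equals g[i] up to floating-point rounding */
}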
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft_tree.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft_tree.c
new file mode 100644
index 000000000..7ff6baca4
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft_tree.c
@@ -0,0 +1,247 @@
+/*
+ * High-speed vectorized FFT tree for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Merge FFT for 2 complex points (4 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log2(fpr *f, const fpr *f0, const fpr *f1) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f0[0];
+ a_im = f0[1];
+ s = fpr_tab_log2[0];
+ b_re = f1[0] * s;
+ b_im = f1[1] * s;
+
+ d_re = b_re - b_im;
+ d_im = b_re + b_im;
+
+ f[0] = a_re + d_re;
+ f[2] = a_im + d_im;
+ f[1] = a_re - d_re;
+ f[3] = a_im - d_im;
+}
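Written with C99 complex numbers, the merge above is simply f[0] = f0 + w*f1 and f[1] = f0 - w*f1 with w = exp(i*pi/4), whose real and imaginary parts are both fpr_tab_log2[0]; the only difference is that the code stores the real parts in f[0..1] and the imaginary parts in f[2..3]. A scalar sketch (illustrative only, not part of the patch):

#include <complex.h>

static void merge_log2_ref(double complex f[2],
                           double complex f0, double complex f1) {
    const double s = 0.707106781186547524400844362;   /* fpr_tab_log2[0] */
    const double complex w = s + s * I;                /* exp(i*pi/4)     */
    f[0] = f0 + w * f1;
    f[1] = f0 - w * f1;
}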
+
+/*
+ * Vectorized 1 layer of Merge FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log3(fpr *f, const fpr *f0, const fpr *f1) {
+ // Total SIMD registers: 12 = 10 + 2
+ float64x2x2_t g1, g0, g_re, g_im, s_re_im; // 10
+ float64x2_t t_re, t_im; // 2
+
+ vloadx2(g1, &f1[0]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(t_re, t_im, g1.val[0], g1.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vloadx2(g0, &f0[0]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0.val[0], g0.val[1], t_re, t_im);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0.val[0], g0.val[1], t_re, t_im);
+
+ vstore2(&f[0], g_re);
+ vstore2(&f[4], g_im);
+}
+
+/*
+ * Vectorized 1 layer of Merge FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log4(fpr *f, const fpr *f0, const fpr *f1, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_merge = fpr_table[logn];
+
+ // Total SIMD register 22 = 14 + 8
+ float64x2x2_t g1_re, g1_im, g0_re, g0_im, s_re_im, t_re, t_im; // 14
+ float64x2x4_t g_re, g_im; // 8
+
+ for (unsigned j = 0; j < ht; j += 4) {
+ vload2(g1_re, &f1[j]);
+ vload2(g1_im, &f1[j + ht]);
+
+ vload2(s_re_im, &fpr_merge[j]);
+
+ FWD_TOP(t_re.val[0], t_im.val[0], g1_re.val[0], g1_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_re, &f0[j]);
+
+ FWD_TOP(t_re.val[1], t_im.val[1], g1_re.val[1], g1_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_im, &f0[j + ht]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_ADDJ(g_re.val[2], g_im.val[2], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+ FPC_SUBJ(g_re.val[3], g_im.val[3], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j << 1], g_re);
+ vstore4(&f[(j + ht) << 1], g_im);
+ }
+}
+
+/*
+ * 1 layer of Split FFT for 2 complex points (4 coefficients).
+ */
+static void
+PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log2(fpr *restrict f0, fpr *restrict f1, const fpr *restrict f) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f[0];
+ b_re = f[1];
+ a_im = f[2];
+ b_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f0[0] = (a_re + b_re) * 0.5;
+ f0[1] = (a_im + b_im) * 0.5;
+
+ d_re = (a_re - b_re) * s;
+ d_im = (a_im - b_im) * s;
+
+ f1[0] = d_im + d_re;
+ f1[1] = d_im - d_re;
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log3(fpr *f0, fpr *f1, const fpr *f) {
+ // Total SIMD registers: 12
+ float64x2x2_t re, im, g0, g1, s_re_im, tm; // 12
+
+ vload2(re, &f[0]);
+ vload2(im, &f[4]);
+
+ FPC_ADD(g0.val[0], g0.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ FPC_SUB(tm.val[0], tm.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmuln(g0.val[0], g0.val[0], 0.5);
+ vfmuln(g0.val[1], g0.val[1], 0.5);
+ vstorex2(&f0[0], g0);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.5);
+ vfmuln(s_re_im.val[1], s_re_im.val[1], 0.5);
+
+ INV_BOTJ(g1.val[0], g1.val[1], tm.val[0], tm.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstorex2(&f1[0], g1);
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log4(fpr *f0, fpr *f1, const fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_split = fpr_table[logn];
+
+ // Total SIMD register 23 = 1 + 8 + 14
+ float64x2_t half; // 1
+ float64x2x4_t g_re, g_im; // 8
+ float64x2x2_t s_re_im, t_re, t_im, g1_re, g1_im, g0_re, g0_im; // 14
+
+ half = vdupq_n_f64(0.5);
+ for (unsigned j = 0; j < ht; j += 4) {
+ unsigned j2 = j << 1;
+ vload4(g_re, &f[j2]);
+ vload4(g_im, &f[j2 + hn]);
+
+ FPC_ADD(g0_re.val[0], g0_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_ADD(g0_re.val[1], g0_im.val[1], g_re.val[2], g_im.val[2], g_re.val[3], g_im.val[3]);
+
+ FPC_SUB(t_re.val[0], t_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_SUB(t_re.val[1], t_im.val[1], g_re.val[3], g_im.val[3], g_re.val[2], g_im.val[2]);
+
+ vload2(s_re_im, &fpr_split[j]);
+
+ vfmul(g0_re.val[0], g0_re.val[0], half);
+ vfmul(g0_re.val[1], g0_re.val[1], half);
+ vstore2(&f0[j], g0_re);
+
+ vfmul(g0_im.val[0], g0_im.val[0], half);
+ vfmul(g0_im.val[1], g0_im.val[1], half);
+ vstore2(&f0[j + ht], g0_im);
+
+ vfmul(s_re_im.val[0], s_re_im.val[0], half);
+ vfmul(s_re_im.val[1], s_re_im.val[1], half);
+
+ INV_BOTJ(g1_re.val[0], g1_im.val[0], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(g1_re.val[1], g1_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstore2(&f1[j], g1_re);
+ vstore2(&f1[j + ht], g1_im);
+ }
+}
+
+/*
+ * Vectorized Split FFT implementation
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1, const fpr *f, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; qn = 0;
+ f0[0] = f[0];
+ f1[0] = f[1];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log2(f0, f1, f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log3(f0, f1, f);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log4(f0, f1, f, logn);
+ break;
+ }
+}
+
+/*
+ * Vectorized Merge FFT implementation
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(fpr *restrict f, const fpr *restrict f0,
+ const fpr *restrict f1, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ f[0] = f0[0];
+ f[1] = f1[0];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log2(f, f0, f1);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log3(f, f0, f1);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log4(f, f0, f1, logn);
+ break;
+ }
+}
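Usage sketch (illustrative, not part of the patch): splitting an FFT-domain polynomial and merging the halves back should reproduce the original values; this is the split/merge pattern the fast Fourier sampling uses during signing.

void split_merge_demo(const fpr *f_fft, unsigned logn) {
    fpr f0[256], f1[256], g[512];   /* room for logn up to 9 */

    PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(f0, f1, f_fft, logn);
    PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(g, f0, f1, logn);
    /* g[0 .. (1 << logn) - 1] matches f_fft up to rounding */
}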
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.c
new file mode 100644
index 000000000..94e92a56c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.c
@@ -0,0 +1,204 @@
+/*
+ * Compressed floating-point Twiddle Factor.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+
+const fpr fpr_p2_tab[] = {
+ 2.00000000000,
+ 1.00000000000,
+ 0.50000000000,
+ 0.25000000000,
+ 0.12500000000,
+ 0.06250000000,
+ 0.03125000000,
+ 0.01562500000,
+ 0.00781250000,
+ 0.00390625000,
+ 0.00195312500
+};
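These entries are the powers 2^(1-logn) for logn = 0..10; the inverse FFT indexes the table with logn to pick up the 2/n normalization in its last layer. A quick check (illustrative only, not part of the patch):

#include <assert.h>
#include <math.h>

void check_p2_tab(void) {
    for (int logn = 0; logn <= 10; logn++) {
        /* fpr_p2_tab[logn] == 2^(1 - logn) == 2.0 / (1u << logn) */
        assert(fpr_p2_tab[logn] == ldexp(1.0, 1 - logn));
    }
}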
+
+const fpr fpr_tab_log2[] = {
+ 0.707106781186547524400844362, 0.707106781186547524400844362, // 4, 5
+};
+
+const fpr fpr_tab_log3[] = {
+ 0.923879532511286756128183189, 0.382683432365089771728459984, // 8, 9
+ -0.382683432365089771728459984, 0.923879532511286756128183189,
+};
+
+const fpr fpr_tab_log4[] = {
+ 0.980785280403230449126182236, 0.195090322016128267848284868, // 16
+ 0.555570233019602224742830814, 0.831469612302545237078788378, // 20
+};
+
+const fpr fpr_tab_log5[] = {
+ 0.995184726672196886244836953, 0.098017140329560601994195564, // 32
+ 0.634393284163645498215171613, 0.773010453362736960810906610, // 36
+ 0.881921264348355029712756864, 0.471396736825997648556387626, // 40
+ 0.290284677254462367636192376, 0.956940335732208864935797887, // 44
+};
+
+const fpr fpr_tab_log6[] = {
+ 0.998795456205172392714771605, 0.049067674327418014254954977, // 64
+ 0.671558954847018400625376850, 0.740951125354959091175616897, // 68
+ 0.903989293123443331586200297, 0.427555093430282094320966857, // 72
+ 0.336889853392220050689253213, 0.941544065183020778412509403, // 76
+ 0.970031253194543992603984207, 0.242980179903263889948274162, // 80
+ 0.514102744193221726593693839, 0.857728610000272069902269984, // 84
+ 0.803207531480644909806676513, 0.595699304492433343467036529, // 88
+ 0.146730474455361751658850130, 0.989176509964780973451673738, // 92
+};
+
+const fpr fpr_tab_log7[] = {
+ 0.999698818696204220115765650, 0.024541228522912288031734529, // 128
+ 0.689540544737066924616730630, 0.724247082951466920941069243, // 132
+ 0.914209755703530654635014829, 0.405241314004989870908481306, // 136
+ 0.359895036534988148775104572, 0.932992798834738887711660256, // 140
+ 0.975702130038528544460395766, 0.219101240156869797227737547, // 144
+ 0.534997619887097210663076905, 0.844853565249707073259571205, // 148
+ 0.817584813151583696504920884, 0.575808191417845300745972454, // 152
+ 0.170961888760301226363642357, 0.985277642388941244774018433, // 156
+ 0.992479534598709998156767252, 0.122410675199216198498704474, // 160
+ 0.615231590580626845484913563, 0.788346427626606262009164705, // 164
+ 0.870086991108711418652292404, 0.492898192229784036873026689, // 168
+ 0.266712757474898386325286515, 0.963776065795439866686464356, // 172
+ 0.949528180593036667195936074, 0.313681740398891476656478846, // 176
+ 0.449611329654606600046294579, 0.893224301195515320342416447, // 180
+ 0.757208846506484547575464054, 0.653172842953776764084203014, // 184
+ 0.073564563599667423529465622, 0.997290456678690216135597140, // 188
+};
+
+const fpr fpr_tab_log8[] = {
+ 0.999924701839144540921646491, 0.012271538285719926079408262, // 256
+ 0.698376249408972853554813503, 0.715730825283818654125532623, // 260
+ 0.919113851690057743908477789, 0.393992040061048108596188661, // 264
+ 0.371317193951837543411934967, 0.928506080473215565937167396, // 268
+ 0.978317370719627633106240097, 0.207111376192218549708116020, // 272
+ 0.545324988422046422313987347, 0.838224705554838043186996856, // 276
+ 0.824589302785025264474803737, 0.565731810783613197389765011, // 280
+ 0.183039887955140958516532578, 0.983105487431216327180301155, // 284
+ 0.993906970002356041546922813, 0.110222207293883058807899140, // 288
+ 0.624859488142386377084072816, 0.780737228572094478301588484, // 292
+ 0.876070094195406607095844268, 0.482183772079122748517344481, // 296
+ 0.278519689385053105207848526, 0.960430519415565811199035138, // 300
+ 0.953306040354193836916740383, 0.302005949319228067003463232, // 304
+ 0.460538710958240023633181487, 0.887639620402853947760181617, // 308
+ 0.765167265622458925888815999, 0.643831542889791465068086063, // 312
+ 0.085797312344439890461556332, 0.996312612182778012627226190, // 316
+ 0.998118112900149207125155861, 0.061320736302208577782614593, // 320
+ 0.662415777590171761113069817, 0.749136394523459325469203257, // 324
+ 0.898674465693953843041976744, 0.438616238538527637647025738, // 328
+ 0.325310292162262934135954708, 0.945607325380521325730945387, // 332
+ 0.966976471044852109087220226, 0.254865659604514571553980779, // 336
+ 0.503538383725717558691867071, 0.863972856121586737918147054, // 340
+ 0.795836904608883536262791915, 0.605511041404325513920626941, // 344
+ 0.134580708507126186316358409, 0.990902635427780025108237011, // 348
+ 0.987301418157858382399815802, 0.158858143333861441684385360, // 352
+ 0.585797857456438860328080838, 0.810457198252594791726703434, // 356
+ 0.851355193105265142261290312, 0.524589682678468906215098464, // 360
+ 0.231058108280671119643236018, 0.972939952205560145467720114, // 364
+ 0.937339011912574923201899593, 0.348418680249434568419308588, // 368
+ 0.416429560097637182562598911, 0.909167983090522376563884788, // 372
+ 0.732654271672412834615546649, 0.680600997795453050594430464, // 376
+ 0.036807222941358832324332691, 0.999322384588349500896221011, // 380
+};
+
+const fpr fpr_tab_log9[] = {
+ 0.999981175282601142656990438, 0.006135884649154475359640235, // 512
+ 0.702754744457225302452914421, 0.711432195745216441522130290, // 516
+ 0.921514039342041943465396332, 0.388345046698826291624993541, // 520
+ 0.377007410216418256726567823, 0.926210242138311341974793388, // 524
+ 0.979569765685440534439326110, 0.201104634842091911558443546, // 528
+ 0.550457972936604802977289893, 0.834862874986380056304401383, // 532
+ 0.828045045257755752067527592, 0.560661576197336023839710223, // 536
+ 0.189068664149806212754997837, 0.981963869109555264072848154, // 540
+ 0.994564570734255452119106243, 0.104121633872054579120943880, // 544
+ 0.629638238914927025372981341, 0.776888465673232450040827983, // 548
+ 0.879012226428633477831323711, 0.476799230063322133342158117, // 552
+ 0.284407537211271843618310615, 0.958703474895871555374645792, // 556
+ 0.955141168305770721498157712, 0.296150888243623824121786128, // 560
+ 0.465976495767966177902756065, 0.884797098430937780104007041, // 564
+ 0.769103337645579639346626069, 0.639124444863775743801488193, // 568
+ 0.091908956497132728624990979, 0.995767414467659793982495643, // 572
+ 0.998475580573294752208559038, 0.055195244349689939809447526, // 576
+ 0.666999922303637506650154222, 0.745057785441465962407907310, // 580
+ 0.901348847046022014570746093, 0.433093818853151968484222638, // 584
+ 0.331106305759876401737190737, 0.943593458161960361495301445, // 588
+ 0.968522094274417316221088329, 0.248927605745720168110682816, // 592
+ 0.508830142543107036931749324, 0.860866938637767279344583877, // 596
+ 0.799537269107905033500246232, 0.600616479383868926653875896, // 600
+ 0.140658239332849230714788846, 0.990058210262297105505906464, // 604
+ 0.988257567730749491404792538, 0.152797185258443427720336613, // 608
+ 0.590759701858874228423887908, 0.806847553543799272206514313, // 612
+ 0.854557988365400520767862276, 0.519355990165589587361829932, // 616
+ 0.237023605994367206867735915, 0.971503890986251775537099622, // 620
+ 0.939459223602189911962669246, 0.342660717311994397592781983, // 624
+ 0.422000270799799685941287941, 0.906595704514915365332960588, // 628
+ 0.736816568877369875090132520, 0.676092703575315960360419228, // 632
+ 0.042938256934940823077124540, 0.999077727752645382888781997, // 636
+ 0.999529417501093163079703322, 0.030674803176636625934021028, // 640
+ 0.685083667772700381362052545, 0.728464390448225196492035438, // 644
+ 0.911706032005429851404397325, 0.410843171057903942183466675, // 648
+ 0.354163525420490382357395796, 0.935183509938947577642207480, // 652
+ 0.974339382785575860518721668, 0.225083911359792835991642120, // 656
+ 0.529803624686294668216054671, 0.848120344803297251279133563, // 660
+ 0.814036329705948361654516690, 0.580813958095764545075595272, // 664
+ 0.164913120489969921418189113, 0.986308097244598647863297524, // 668
+ 0.991709753669099522860049931, 0.128498110793793172624415589, // 672
+ 0.610382806276309452716352152, 0.792106577300212351782342879, // 676
+ 0.867046245515692651480195629, 0.498227666972781852410983869, // 680
+ 0.260794117915275518280186509, 0.965394441697689374550843858, // 684
+ 0.947585591017741134653387321, 0.319502030816015677901518272, // 688
+ 0.444122144570429231642069418, 0.895966249756185155914560282, // 692
+ 0.753186799043612482483430486, 0.657806693297078656931182264, // 696
+ 0.067443919563664057897972422, 0.997723066644191609848546728, // 700
+ 0.996820299291165714972629398, 0.079682437971430121147120656, // 704
+ 0.648514401022112445084560551, 0.761202385484261814029709836, // 708
+ 0.890448723244757889952150560, 0.455083587126343823535869268, // 712
+ 0.307849640041534893682063646, 0.951435020969008369549175569, // 716
+ 0.962121404269041595429604316, 0.272621355449948984493347477, // 720
+ 0.487550160148435954641485027, 0.873094978418290098636085973, // 724
+ 0.784556597155575233023892575, 0.620057211763289178646268191, // 728
+ 0.116318630911904767252544319, 0.993211949234794533104601012, // 732
+ 0.984210092386929073193874387, 0.177004220412148756196839844, // 736
+ 0.570780745886967280232652864, 0.821102514991104679060430820, // 740
+ 0.841554977436898409603499520, 0.540171472729892881297845480, // 744
+ 0.213110319916091373967757518, 0.977028142657754351485866211, // 748
+ 0.930766961078983731944872340, 0.365612997804773870011745909, // 752
+ 0.399624199845646828544117031, 0.916679059921042663116457013, // 756
+ 0.720002507961381629076682999, 0.693971460889654009003734389, // 760
+ 0.018406729905804820927366313, 0.999830581795823422015722275, // 764
+};
+
+const fpr *fpr_table[] = {
+ NULL, NULL,
+ fpr_tab_log2,
+ fpr_tab_log3,
+ fpr_tab_log4,
+ fpr_tab_log5,
+ fpr_tab_log6,
+ fpr_tab_log7,
+ fpr_tab_log8,
+ fpr_tab_log9,
+};
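Each (re, im) pair in the tables above is the cosine/sine of an odd multiple of pi/2^level, listed in the order the corresponding FFT layer consumes them; fpr_table[] then lets the generic loops pick the right table from logn alone. A spot check of the first entries (illustrative only, not part of the patch):

#include <assert.h>
#include <math.h>

void check_twiddles(void) {
    const double pi = 3.14159265358979323846;
    assert(fabs(fpr_tab_log2[0] - cos(pi / 4.0)) < 1e-15);
    assert(fabs(fpr_tab_log3[0] - cos(pi / 8.0)) < 1e-15);
    assert(fabs(fpr_tab_log3[1] - sin(pi / 8.0)) < 1e-15);
    assert(fabs(fpr_tab_log4[0] - cos(pi / 16.0)) < 1e-15);
    assert(fabs(fpr_tab_log4[1] - sin(pi / 16.0)) < 1e-15);
}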
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.h
new file mode 100644
index 000000000..6a045a45e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.h
@@ -0,0 +1,245 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "macrof.h"
+/*
+ * In this implementation, 'fpr' is simply the native 'double' type; the
+ * FPR() and fpr_*() wrappers below keep the code shaped like the reference
+ * implementation. This should have no extra runtime cost, since all the
+ * functions below are 'inline'.
+ */
+typedef double fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return (double)i;
+}
+
+static const fpr fpr_q = 12289.0 ;
+static const fpr fpr_inverse_of_q = 1.0 / 12289.0 ;
+static const fpr fpr_inv_2sqrsigma0 = .150865048875372721532312163019 ;
+static const fpr fpr_inv_sigma_9 = 0.0060336696681577241031668062510953022 ;
+static const fpr fpr_sigma_min_9 = 1.2778336969128335860256340575729042 ;
+static const fpr fpr_log2 = 0.69314718055994530941723212146 ;
+static const fpr fpr_inv_log2 = 1.4426950408889634073599246810 ;
+static const fpr fpr_bnorm_max = 16822.4121 ;
+static const fpr fpr_zero = 0.0 ;
+static const fpr fpr_one = 1.0 ;
+static const fpr fpr_two = 2.0 ;
+static const fpr fpr_onehalf = 0.5 ;
+static const fpr fpr_invsqrt2 = 0.707106781186547524400844362105 ;
+static const fpr fpr_invsqrt8 = 0.353553390593273762200422181052 ;
+static const fpr fpr_ptwo31 = 2147483648.0 ;
+static const fpr fpr_ptwo31m1 = 2147483647.0 ;
+static const fpr fpr_mtwo31m1 = -2147483647.0 ;
+static const fpr fpr_ptwo63m1 = 9223372036854775807.0 ;
+static const fpr fpr_mtwo63m1 = -9223372036854775807.0 ;
+static const fpr fpr_ptwo63 = 9223372036854775808.0 ;
+
+static inline int64_t
+fpr_rint(fpr x) {
+ int64_t t;
+ __asm__ ( "fcvtns %x0, %d1": "=r" (t) : "w" (x));
+ return t;
+}
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x;
+ return r - (x < (double)r);
+}
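A worked example of the correction above (illustrative only, not part of the patch): truncation rounds toward zero, so negative non-integers come out one too high, and the (x < (double)r) comparison supplies the missing -1.

static int64_t floor_ref_demo(double x) {
    int64_t r = (int64_t)x;         /* -1.25 -> -1,  +1.25 -> +1 */
    return r - (x < (double)r);     /* -1.25 -> -2,  +1.25 -> +1 */
}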
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x;
+}
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return (x + y);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return (x - y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return (-x);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return (x * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return (x + x);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return (x * y);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return (x * x);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return (1.0 / x);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return (x / y);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ __asm__ ( "fsqrt %d0, %d0" : "+w" (x) : : );
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x < y;
+}
+
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ static const double C_expm[] = {
+ 1.000000000000000000000000000000, // c0
+ -0.999999999999994892974086724280, // c1
+ 0.500000000000019206858326015208, // c2
+ -0.166666666666984014666397229121, // c3
+ 0.041666666666110491190622155955, // c4
+ -0.008333333327800835146903501993, // c5
+ 0.001388888894063186997887560103, // c6
+ -0.000198412739277311890541063977, // c7
+ 0.000024801566833585381209939524, // c8
+ -0.000002755586350219122514855659, // c9
+ 0.000000275607356160477811864927, // c10
+ -0.000000025299506379442070029551, // c11
+ 0.000000002073772366009083061987, // c12
+ 0.000000000000000000000000000000,
+ };
+ float64x2_t neon_x, neon_1x, neon_x2,
+ neon_x4, neon_x8, neon_x12, neon_ccs;
+ float64x2x4_t neon_exp0;
+ float64x2x3_t neon_exp1;
+ float64x2_t y1, y2, y3, y;
+ double ret;
+
+ neon_exp0 = vld1q_f64_x4(&C_expm[0]);
+ neon_exp1 = vld1q_f64_x3(&C_expm[8]);
+ neon_ccs = vdupq_n_f64(ccs);
+ neon_ccs = vmulq_n_f64(neon_ccs, fpr_ptwo63);
+
+ // x | x
+ neon_x = vdupq_n_f64(x);
+ // 1 | x
+ neon_1x = vsetq_lane_f64(1.0, neon_x, 0);
+ neon_x2 = vmulq_f64(neon_x, neon_x);
+ neon_x4 = vmulq_f64(neon_x2, neon_x2);
+ neon_x8 = vmulq_f64(neon_x4, neon_x4);
+ neon_x12 = vmulq_f64(neon_x8, neon_x4);
+
+ vfmla(y1, neon_exp0.val[0], neon_exp0.val[1], neon_x2);
+ vfmla(y2, neon_exp0.val[2], neon_exp0.val[3], neon_x2);
+ vfmla(y3, neon_exp1.val[0], neon_exp1.val[1], neon_x2);
+
+ y1 = vmulq_f64(y1, neon_1x);
+ y2 = vmulq_f64(y2, neon_1x);
+ y3 = vmulq_f64(y3, neon_1x);
+
+ vfmla(y, y1, y2, neon_x4);
+ vfmla(y, y, y3, neon_x8);
+ vfmla(y, y, neon_exp1.val[2], neon_x12);
+ y = vmulq_f64( y, neon_ccs);
+ ret = vaddvq_f64(y);
+
+ return (uint64_t) ret;
+}
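Scalar reference for the vectorized polynomial above (illustrative only, not part of the patch): for 0 <= x < log(2) and ccs < 1, the result is approximately 2^63 * ccs * exp(-x), which is what the degree-12 polynomial in C_expm approximates before the final horizontal add.

#include <math.h>

static uint64_t expm_p63_ref(double x, double ccs) {
    /* assumes ccs * exp(-x) < 1 so the cast to uint64_t is well defined */
    return (uint64_t)(exp(-x) * ccs * 9223372036854775808.0);
}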
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED512_AARCH64_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+#define fpr_tab_log2 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log2
+#define fpr_tab_log3 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log3
+#define fpr_tab_log4 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log4
+#define fpr_tab_log5 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log5
+#define fpr_tab_log6 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log6
+#define fpr_tab_log7 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log7
+#define fpr_tab_log8 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log8
+#define fpr_tab_log9 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log9
+#define fpr_table PQCLEAN_FALCONPADDED512_AARCH64_fpr_table
+
+extern const fpr fpr_tab_log2[];
+extern const fpr fpr_tab_log3[];
+extern const fpr fpr_tab_log4[];
+extern const fpr fpr_tab_log5[];
+extern const fpr fpr_tab_log6[];
+extern const fpr fpr_tab_log7[];
+extern const fpr fpr_tab_log8[];
+extern const fpr fpr_tab_log9[];
+extern const fpr *fpr_table[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/inner.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/inner.h
new file mode 100644
index 000000000..65b0e7799
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/inner.h
@@ -0,0 +1,825 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+#include "params.h"
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED512_AARCH64_ macro (e.g. PQCLEAN_FALCONPADDED512_AARCH64_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED512_AARCH64_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(void *out, size_t max_out_len, const int8_t *x, uint8_t bits);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x);
+
+size_t PQCLEAN_FALCONPADDED512_AARCH64_modq_decode(uint16_t *x, const void *in,
+ size_t max_in_len, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(int8_t *x, unsigned bits, const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len);
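Usage sketch (illustrative only, not part of the patch): as documented above, passing out == NULL makes an encoder return the exact output length, so a caller can size its buffer first and then encode for real.

size_t encode_demo(uint8_t *out, size_t out_cap, const int8_t *f, uint8_t bits) {
    size_t need;

    need = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(NULL, 0, f, bits);
    if (need == 0 || need > out_cap) {
        return 0;   /* a coefficient was out of range, or the buffer is too small */
    }
    return PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(out, out_cap, f, bits);
}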
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
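Usage sketch (illustrative only, not part of the patch): hashing a nonce and a message into a point with the SHAKE256 wrappers defined earlier in this header; sizes assume logn = 9 (n = 512) and the 40-byte nonce used by Falcon signatures.

void hash_to_point_demo(const uint8_t *nonce, const uint8_t *msg, size_t msg_len) {
    inner_shake256_context sc;
    uint16_t hm[512];
    uint16_t tmp[512];   /* 2 * 2^logn bytes, 16-bit aligned */

    inner_shake256_init(&sc);
    inner_shake256_inject(&sc, nonce, 40);
    inner_shake256_inject(&sc, msg, msg_len);
    inner_shake256_flip(&sc);
    PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(&sc, hm, 9, (uint8_t *)tmp);
    inner_shake256_ctx_release(&sc);
    /* hm[] now holds the hashed point, usable as c0 in verification */
}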
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short(const int16_t *s1, const int16_t *s2);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const double *t0,
+ const double *t1);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+/*
+ * Convert a public key to NTT. Conversion is done in place.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt(int16_t *h);
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt_monty(int16_t *h);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_compute_public(int16_t *h, const int8_t *f,
+ const int8_t *g, int16_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ * return exp(x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] 1/sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+ }
+ return v;
+}
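Usage sketch (illustrative only, not part of the patch): seed the ChaCha20-based PRNG from a flipped SHAKE256 context, then draw values with the inline accessors above (the 48-byte zero seed here is a placeholder).

void prng_demo(void) {
    inner_shake256_context sc;
    prng p;
    uint8_t seed[48] = {0};   /* placeholder; use real entropy in practice */
    uint64_t r;

    inner_shake256_init(&sc);
    inner_shake256_inject(&sc, seed, sizeof seed);
    inner_shake256_flip(&sc);

    PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&p, &sc);
    r = prng_get_u64(&p);
    (void)r;

    inner_shake256_ctx_release(&sc);
}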
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(fpr *c, const fpr *a, const fpr *restrict b, const fpr *restrict d, unsigned logn);
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(fpr *d, fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d,
+ const fpr *a, const fpr *restrict b, unsigned logn);
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(fpr *c, const fpr *restrict d, const fpr *restrict a, unsigned logn);
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_fft(fpr *restrict c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add_muladj_fft(fpr *restrict d,
+ const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01, fpr *restrict g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(fpr *restrict d11, fpr *restrict l10,
+ const fpr *restrict g00, const fpr *restrict g01,
+ const fpr *restrict g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1,
+ const fpr *restrict f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(fpr *restrict f,
+ const fpr *restrict f0, const fpr *restrict f1, unsigned logn);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm, const unsigned falcon_n);
+
+fpr PQCLEAN_FALCONPADDED512_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2);
+
+int32_t PQCLEAN_FALCONPADDED512_AARCH64_poly_small_sqnorm(const int8_t *f); // common.c
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
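[Editorial note: as a quick check of the 28*2^logn formula above, logn = 9 gives 28 * 512 = 14336 bytes (FALCON_KEYGEN_TEMP_9) and logn = 10 gives 28 * 1024 = 28672 bytes (FALCON_KEYGEN_TEMP_10), while the logn = 1 and logn = 2 values (136 and 272) exceed the nominal 56 and 112 bytes, as the comment notes.]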
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
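[Editorial note, not part of the patch: a hypothetical calling sequence for Falcon-512 (logn = 9) might look as follows. It assumes the inner_shake256_* wrappers declared in inner.h and a seed[] obtained from a cryptographically secure RNG; the union is one way to satisfy the 64-bit alignment requirement on tmp[].]

/* Editorial sketch, not part of the upstream sources. */
void keygen_demo(const uint8_t seed[48]) {
    int8_t f[512], g[512], F[512], G[512];
    uint16_t h[512];
    union {                              /* aligned for uint32_t, uint64_t and fpr */
        uint8_t b[FALCON_KEYGEN_TEMP_9];
        uint64_t align_u64;
        fpr align_fpr;
    } tmp;
    inner_shake256_context rng;

    inner_shake256_init(&rng);
    inner_shake256_inject(&rng, seed, 48);
    inner_shake256_flip(&rng);           /* context must be seeded and flipped */
    PQCLEAN_FALCONPADDED512_AARCH64_keygen(&rng, f, g, F, G, h, 9, tmp.b);
}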
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp);
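[Editorial note: to make the trade-off concrete at logn = 9, sign_tree() works from a (8*9+40)*512 = 57344-byte expanded key plus a 48*512 = 24576-byte tmp, whereas sign_dyn() needs only the raw f, g, F, G and a single 72*512 = 36864-byte tmp, at the cost of recomputing the B0 matrix and LDL tree for every signature.]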
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p);
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/keygen.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/keygen.c
new file mode 100644
index 000000000..feee9d483
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/keygen.c
@@ -0,0 +1,4200 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "util.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
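[Editorial note: this is a standard Newton (Hensel) inversion. y = 2 - p is already an inverse of p modulo 2^2, and each step y <- y*(2 - p*y) doubles the number of correct low bits, so four steps give 1/p modulo 2^32; masking to 31 bits and negating then yields -1/p mod 2^31, as the comment above states.]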
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
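[Editorial note, not part of the patch: the conversion convention described in the comment block near the top of this file can be made concrete with two small helpers. Entering the Montgomery domain is a Montgomery multiplication by R2 = 2^62 mod p (as computed by modp_R2() above); leaving it is a Montgomery multiplication by 1.]

/* Editorial sketch, not part of the upstream sources. */
static inline uint32_t to_monty(uint32_t x, uint32_t p, uint32_t p0i, uint32_t R2) {
    return modp_montymul(x, R2, p, p0i);   /* (x * R^2) / R = x*R mod p */
}
static inline uint32_t from_monty(uint32_t xm, uint32_t p, uint32_t p0i) {
    return modp_montymul(xm, 1, p, p0i);   /* (x*R * 1) / R = x mod p */
}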
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also remove one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
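[Editorial note: for example, under this convention the value 2^40 + 5 occupies two words, x[0] = 5 and x[1] = 2^9 = 512, since each uint32_t word carries 31 payload bits.]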
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The subtraction is actually performed only if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
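[Editorial note: the "multiply x by 2^31" step above is implemented as modp_montymul(x, R2, p, p0i). Since R2 = 2^62 mod p, the Montgomery product is x*2^62/2^31 = x*2^31 mod p, which is exactly the shift by one 31-bit word that the word-by-word injection needs.]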
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *restrict x,
+ const uint32_t *restrict y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *restrict tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
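[Editorial note, toy illustration of the per-prime update with hypothetical numbers (not an actual prime pair from PRIMES[]): take q = 5 as the product of previous primes, p = 7 as the new prime, and s = 1/5 mod 7 = 3. For the true value 23, x mod q = 3 and xp = 23 mod 7 = 2, so the update gives 3 + 5*((3*(2-3)) mod 7) = 3 + 5*4 = 23, as expected.]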
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have their 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
+ const uint32_t *restrict x, const uint32_t *restrict y,
+ size_t len, uint32_t *restrict tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remainder of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
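+ * Toy illustration of the bit-by-bit algorithm (with small,
+ * non-representative values): for x = 7 and y = 5 we start with
+ * a = 7, b = 5, u0 = 1, v0 = 0, u1 = 5, v1 = 6, and get:
+ *
+ * a,b odd, a > b: a = 1, u0 = 3, v0 = 4 (7*3 - 5*4 = 1)
+ * a,b odd, b > a: b = 2, u1 = 1, v1 = 1 (7*1 - 5*1 = 2)
+ * b even: b = 1, u1 = 3, v1 = 4
+ * a = b: b = 0
+ *
+ * so the GCD is a = 1 and x*u0 - y*v0 = 7*3 - 5*4 = 1.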
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each iteration reduces the total length
+ * (sum of the lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, so a division by 2 simply amounts to
+ * not multiplying by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
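+ * (e.g. sc = 40 gives sch = 1 and scl = 9: y is shifted by one
+ * full 31-bit word plus 9 bits within each word)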
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
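+ /*
+ * Bit 30 is the sign bit of the 31-bit limb; copying it into
+ * bit 31 turns w into the correct 32-bit two's complement value.
+ */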
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of
+ * each, but to an upper word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
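+ *
+ * The index wrap at u + v == n - 1 in the loop below, together
+ * with the sign flip on kf, implements the reduction modulo
+ * X^N+1 (since X^N = -1).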
+ */
+static void
+poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *restrict tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * The table below encodes a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), we can just generate more values and add them
+ * together for lower dimensions.
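+ * For instance, for logn = 9 (N = 512) we add g = 2 samples, and
+ * their sum has standard deviation sigma*sqrt(2) =
+ * 1.17*sqrt(q/(2*512)), which is the deviation required for that
+ * degree.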
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
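+ *
+ * For instance, MAX_BL_SMALL[10] = 209 words, i.e. 209*31 = 6479
+ * bits, which sits about seven standard deviations above the
+ * measured depth-10 average of 6307.52 bits for fg.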
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
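+ /*
+ * If any partial sum reached 2^31, then ng has its top bit set,
+ * -(ng >> 31) is all-ones and the result saturates to 2^32-1;
+ * otherwise the mask is zero and s is returned unchanged.
+ */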
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
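+ *
+ * In effect, each pair of consecutive NTT coefficients corresponds
+ * to a pair of roots (w, -w); their product is f(w)*f(-w) =
+ * (f*adj(f))(w) = f'(w^2), i.e. the value of the half-degree
+ * polynomial at the squared root (see the equations detailed in
+ * solve_NTRU_binary_depth1()).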
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ for (d = 0; d < depth; d ++) {
+ make_fg_step(data, logn - d, d,
+ d != 0, (d + 1) < depth || out_ntt);
+ }
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply a failure to solve the NTRU equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
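+ * Since zint_bezout() guarantees fp*Gp - gp*Fp = 1, the
+ * multiplication by q turns this into fp*Gp - gp*Fp = q, i.e.
+ * the NTRU equation at the deepest level.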
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficient will remain low. If it goes out of range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ rlen = (slen > 10) ? 10 : slen;
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt3, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt4, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ rlen = (FGlen > 10) ? 10 : FGlen;
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_add(rt2, rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -2^31-1..+2^31-1.
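+ *
+ * (The true value is the rt2 content times 2^(scale_FG-scale_fg);
+ * dividing by 2^(scale_k) gives the integer k we want, hence the
+ * multiplier 2^(scale_FG - scale_fg - scale_k) = 2^(-dc) applied
+ * below.)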
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
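+ /*
+ * Square-and-multiply on the bits of dc: for instance, dc = 5
+ * (binary 101) with pt = 1/2 yields pdc = (1/2) * (1/2)^4 = 2^(-5).
+ */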
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the first n elements
+ * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
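+
+ /*
+ * Concretely, the pass computes k = round((F*adj(f) + G*adj(g)) /
+ * (f*adj(f) + g*adj(g))) (the division is done in FFT representation,
+ * the rounding on the resulting coefficients), then replaces (F, G)
+ * with (F - k*f, G - k*g).
+ */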
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt5, rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt3, rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt4, rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
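+
+ /*
+ * In other words, writing f(X) = f0(X^2) + X*f1(X^2) (and similarly
+ * for g), f' is the field norm: f'(X^2) = f(X)*f(-X), and
+ * F(X) = F'(X^2)*g(-X), G(X) = G'(X^2)*f(-X), so that
+ * f*G - g*F = f'(X^2)*G'(X^2) - g'(X^2)*F'(X^2) = q.
+ */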
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
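+ /*
+ * As in the depth-1 case, the two NTT roots of each pair are stored
+ * consecutively (w at an even index, -w at the following odd index),
+ * and g(-w) (resp. f(-w)) is obtained by swapping the pair; hence the
+ * A/B swap in the products below.
+ */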
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_autoadj_fft(rt3, rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
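+ /*
+ * Each modp_montymul() product carries an extra R^-1 Montgomery
+ * factor; applying modp_montymul() to the constant q = 12289 as well
+ * puts both sides of the comparison below on the same scale.
+ */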
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be 1; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
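+ /*
+ * (Modulo 2, X^N+1 = (X+1)^N, so the resultant is congruent to
+ * f(1)^N = f(1), i.e. to the sum of the coefficients.)
+ */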
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ int16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+
+ poly_small_to_fp(rt1, f, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt1, rt1, logn);
+
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt2, rt2, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(rt1, rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(rt2, rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt2, logn);
+
+ bnorm = PQCLEAN_FALCONPADDED512_AARCH64_compute_bnorm(rt1, rt2);
+
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (int16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = (int16_t *)h;
+ tmp2 = (int16_t *)tmp;
+ }
+
+ if (!PQCLEAN_FALCONPADDED512_AARCH64_compute_public(h2, f, g, tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrof.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrof.h
new file mode 100644
index 000000000..c8f82991e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrof.h
@@ -0,0 +1,125 @@
+/*
+ * 64-bit Floating point NEON macro x1
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+// c <= addr x1
+#define vload(c, addr) c = vld1q_f64(addr);
+// c <= addr interleave 2
+#define vload2(c, addr) c = vld2q_f64(addr);
+// c <= addr interleave 4
+#define vload4(c, addr) c = vld4q_f64(addr);
+
+#define vstore(addr, c) vst1q_f64(addr, c);
+// addr <= c
+#define vstore2(addr, c) vst2q_f64(addr, c);
+// addr <= c
+#define vstore4(addr, c) vst4q_f64(addr, c);
+
+// c <= addr x2
+#define vloadx2(c, addr) c = vld1q_f64_x2(addr);
+// c <= addr x3
+#define vloadx3(c, addr) c = vld1q_f64_x3(addr);
+
+// addr <= c
+#define vstorex2(addr, c) vst1q_f64_x2(addr, c);
+
+// c = a - b
+#define vfsub(c, a, b) c = vsubq_f64(a, b);
+
+// c = a + b
+#define vfadd(c, a, b) c = vaddq_f64(a, b);
+
+// c = a * b
+#define vfmul(c, a, b) c = vmulq_f64(a, b);
+
+// c = a * n (n is constant)
+#define vfmuln(c, a, n) c = vmulq_n_f64(a, n);
+
+// Swap from a|b to b|a
+#define vswap(c, a) c = vextq_f64(a, a, 1);
+
+// c = a * b[i]
+#define vfmul_lane(c, a, b, i) c = vmulq_laneq_f64(a, b, i);
+
+// c = 1/a
+#define vfinv(c, a) c = vdivq_f64(vdupq_n_f64(1.0), a);
+
+// c = -a
+#define vfneg(c, a) c = vnegq_f64(a);
+
+#define transpose_f64(a, b, t, ia, ib, it) \
+ t.val[it] = a.val[ia]; \
+ a.val[ia] = vzip1q_f64(a.val[ia], b.val[ib]); \
+ b.val[ib] = vzip2q_f64(t.val[it], b.val[ib]);
+
+/*
+ * c = a + jb
+ * c[0] = a[0] - b[1]
+ * c[1] = a[1] + b[0]
+ */
+#define vfcaddj(c, a, b) c = vcaddq_rot90_f64(a, b);
+
+/*
+ * c = a - jb
+ * c[0] = a[0] + b[1]
+ * c[1] = a[1] - b[0]
+ */
+#define vfcsubj(c, a, b) c = vcaddq_rot270_f64(a, b);
+
+// c[0] = c[0] + b[0]*a[0], c[1] = c[1] + b[1]*a[0]
+#define vfcmla(c, a, b) c = vcmlaq_f64(c, a, b);
+
+// c[0] = c[0] - b[1]*a[1], c[1] = c[1] + b[0]*a[1]
+#define vfcmla_90(c, a, b) c = vcmlaq_rot90_f64(c, a, b);
+
+// c[0] = c[0] - b[0]*a[0], c[1] = c[1] - b[1]*a[0]
+#define vfcmla_180(c, a, b) c = vcmlaq_rot180_f64(c, a, b);
+
+// c[0] = c[0] + b[1]*a[1], c[1] = c[1] - b[0]*a[1]
+#define vfcmla_270(c, a, b) c = vcmlaq_rot270_f64(c, a, b);
+
+/*
+ * Complex MUL: c = a*b
+ * c[0] = a[0]*b[0] - a[1]*b[1]
+ * c[1] = a[0]*b[1] + a[1]*b[0]
+ */
+#define FPC_CMUL(c, a, b) \
+ c = vmulq_laneq_f64(b, a, 0); \
+ c = vcmlaq_rot90_f64(c, a, b);
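+
+/*
+ * Per 128-bit lane holding (re, im): the lane-0 multiply yields
+ * (a_re*b_re, a_re*b_im), and vcmlaq_rot90 then adds
+ * (-a_im*b_im, +a_im*b_re), giving the full complex product.
+ */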
+
+/*
+ * Complex MUL: c = a * conjugate(b) = a * (b[0], -b[1])
+ * c[0] = b[0]*a[0] + b[1]*a[1]
+ * c[1] = + b[0]*a[1] - b[1]*a[0]
+ */
+#define FPC_CMUL_CONJ(c, a, b) \
+ c = vmulq_laneq_f64(a, b, 0); \
+ c = vcmlaq_rot270_f64(c, b, a);
+
+// d = c + a *b
+#define vfmla(d, c, a, b) d = vfmaq_f64(c, a, b);
+// d = c - a * b
+#define vfmls(d, c, a, b) d = vfmsq_f64(c, a, b);
+// d = c + a * b[i]
+#define vfmla_lane(d, c, a, b, i) d = vfmaq_laneq_f64(c, a, b, i);
+// d = c - a * b[i]
+#define vfmls_lane(d, c, a, b, i) d = vfmsq_laneq_f64(c, a, b, i);
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrofx4.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrofx4.h
new file mode 100644
index 000000000..e6b70e64e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrofx4.h
@@ -0,0 +1,430 @@
+/*
+ * 64-bit Floating point NEON macro x4
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+#include "macrof.h"
+
+#define vloadx4(c, addr) c = vld1q_f64_x4(addr);
+
+#define vstorex4(addr, c) vst1q_f64_x4(addr, c);
+
+#define vfdupx4(c, constant) \
+ c.val[0] = vdupq_n_f64(constant); \
+ c.val[1] = vdupq_n_f64(constant); \
+ c.val[2] = vdupq_n_f64(constant); \
+ c.val[3] = vdupq_n_f64(constant);
+
+#define vfnegx4(c, a) \
+ c.val[0] = vnegq_f64(a.val[0]); \
+ c.val[1] = vnegq_f64(a.val[1]); \
+ c.val[2] = vnegq_f64(a.val[2]); \
+ c.val[3] = vnegq_f64(a.val[3]);
+
+#define vfmulnx4(c, a, n) \
+ c.val[0] = vmulq_n_f64(a.val[0], n); \
+ c.val[1] = vmulq_n_f64(a.val[1], n); \
+ c.val[2] = vmulq_n_f64(a.val[2], n); \
+ c.val[3] = vmulq_n_f64(a.val[3], n);
+
+// c = a - b
+#define vfsubx4(c, a, b) \
+ c.val[0] = vsubq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_f64(a.val[3], b.val[3]);
+
+// c = a + b
+#define vfaddx4(c, a, b) \
+ c.val[0] = vaddq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vmulq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vmulq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vmulq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4_i(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b); \
+ c.val[1] = vmulq_f64(a.val[1], b); \
+ c.val[2] = vmulq_f64(a.val[2], b); \
+ c.val[3] = vmulq_f64(a.val[3], b);
+
+#define vfinvx4(c, a) \
+ c.val[0] = vdivq_f64(vdupq_n_f64(1.0), a.val[0]); \
+ c.val[1] = vdivq_f64(vdupq_n_f64(1.0), a.val[1]); \
+ c.val[2] = vdivq_f64(vdupq_n_f64(1.0), a.val[2]); \
+ c.val[3] = vdivq_f64(vdupq_n_f64(1.0), a.val[3]);
+
+#define vfcvtx4(c, a) \
+ c.val[0] = vcvtq_f64_s64(a.val[0]); \
+ c.val[1] = vcvtq_f64_s64(a.val[1]); \
+ c.val[2] = vcvtq_f64_s64(a.val[2]); \
+ c.val[3] = vcvtq_f64_s64(a.val[3]);
+
+#define vfmlax4(d, c, a, b) \
+ vfmla(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmla(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmla(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmla(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfmlsx4(d, c, a, b) \
+ vfmls(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmls(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmls(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmls(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfrintx4(c, a) \
+ c.val[0] = vcvtnq_s64_f64(a.val[0]); \
+ c.val[1] = vcvtnq_s64_f64(a.val[1]); \
+ c.val[2] = vcvtnq_s64_f64(a.val[2]); \
+ c.val[3] = vcvtnq_s64_f64(a.val[3]);
+
+/*
+ * Wrapper for FFT, split/merge and poly_float.c
+ */
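+
+/*
+ * The x2/x4 macro variants below are straightforward unrollings of the
+ * single-vector forms over the .val[0..1] / .val[0..3] members of the
+ * float64x2x2_t / float64x2x4_t types, so callers can work on the wider
+ * vloadx2/vloadx4 loads without extra register shuffling.
+ */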
+
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmul(d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MULx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MULx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmul(d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmul(d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmul(d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
+#define FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
+#define FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmul(d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmul(d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmul(d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmul(d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmul(d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmul(d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmul(d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MUL_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_re, b_re_im, 1); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 0);
+
+#define FPC_MUL_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 1); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 0);
+
+#define FWD_TOP(t_re, t_im, b_re, b_im, zeta_re, zeta_im) \
+ FPC_MUL(t_re, t_im, b_re, b_im, zeta_re, zeta_im);
+
+#define FWD_TOP_LANE(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANE(t_re, t_im, b_re, b_im, zeta);
+
+#define FWD_TOP_LANEx4(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANEx4(t_re, t_im, b_re, b_im, zeta);
+
+/*
+ * FPC
+ */
+
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_re); \
+ d_im = vsubq_f64(a_im, b_im);
+
+#define FPC_SUBx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_im.val[3]);
+
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_re); \
+ d_im = vaddq_f64(a_im, b_im);
+
+#define FPC_ADDx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_im.val[3]);
+
+#define FWD_BOT(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUB(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+/*
+ * FPC_J
+ */
+
+#define FPC_ADDJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_im); \
+ d_im = vaddq_f64(a_im, b_re);
+
+#define FPC_ADDJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_re.val[3]);
+
+#define FPC_SUBJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_im); \
+ d_im = vsubq_f64(a_im, b_re);
+
+#define FPC_SUBJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_re.val[3]);
+
+#define FWD_BOTJ(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJ(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJ(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTJx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+//============== Inverse FFT
+/*
+ * FPC_J
+ * a * conj(b)
+ * Original (without swap):
+ * d_re = b_im * a_im + a_re * b_re;
+ * d_im = b_re * a_im - a_re * b_im;
+ */
+#define FPC_MUL_BOTJ_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmla_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_im, b_re_im, 0); \
+ vfmls_lane(d_im, d_im, a_re, b_re_im, 1);
+
+#define FPC_MUL_BOTJ_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmls_lane(d_im.val[0], d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmls_lane(d_im.val[1], d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmls_lane(d_im.val[2], d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[3], a_im.val[3], b_re_im, 0); \
+ vfmls_lane(d_im.val[3], d_im.val[3], a_re.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, b_im, a_im); \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmul(d_im, b_re, a_im); \
+ vfmls(d_im, d_im, a_re, b_im);
+
+#define INV_TOPJ(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta);
+
+/*
+ * FPC_Jm
+ * a * -conj(b)
+ * d_re = a_re * b_im - a_im * b_re;
+ * d_im = a_im * b_im + a_re * b_re;
+ */
+#define FPC_MUL_BOTJm_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 1); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 0); \
+ vfmul_lane(d_im, a_re, b_re_im, 0); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 1);
+
+#define FPC_MUL_BOTJm_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 1); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 1); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 1); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 1); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJm(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_im); \
+ vfmls(d_re, d_re, a_im, b_re); \
+ vfmul(d_im, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_re);
+
+#define INV_TOPJm(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJmx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta);
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrous.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrous.h
new file mode 100644
index 000000000..dfee8bc12
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrous.h
@@ -0,0 +1,469 @@
+/*
+ * Macros for signed/unsigned integer operations
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+#define vmull_lo(c, a, b) c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
+
+#define vmull_hi(c, a, b) c = vmull_high_s16(a, b);
+
+#define vmulla_lo(d, c, a, b) d = vmlal_s16(c, vget_low_s16(a), vget_low_s16(b));
+
+#define vmulla_hi(d, c, a, b) d = vmlal_high_s16(c, a, b);
+
+#define vadd(c, a, b) c = vaddq_u32(a, b);
+
+#define vaddv(c, a) c = vaddvq_u32(a);
+
+#define vor(c, a, b) c = vorrq_u32(a, b);
+
+// Macros for NTT operations, using signed 16-bit arithmetic.
+#define vload_s16_4(c, addr) c = vld4q_s16(addr);
+#define vload_s16_x2(c, addr) c = vld1q_s16_x2(addr);
+#define vload_s16_x4(c, addr) c = vld1q_s16_x4(addr);
+
+#define vstore_s16_x4(addr, c) vst1q_s16_x4(addr, c);
+#define vstore_s16_x2(addr, c) vst1q_s16_x2(addr, c);
+#define vstore_s16_4(add, c) vst4q_s16(add, c);
+
+/*
+ * Strategy for the NTT:
+ * - Forward and inverse NTT multiplications by constants use either Barrett or Montgomery *Rounding* arithmetic
+ * - Pointwise multiplication must use Montgomery *Doubling* arithmetic
+ *
+ * Rounding, because:
+ *
+ * - Montgomery rounding needs one operand to be *odd*, so it only works with a precomputed coefficient.
+ * => This approach was tried; it is very strict on the input range of the coefficients.
+ * => E.g. a*b: a in [-R/2, R/2], b in [-Q/2, Q/2], then c in [-2Q, 2Q]
+ *
+ * - Barrett multiplication works with no such restriction.
+ * => Proved to be good. E.g. c = a*b, a in [-R, R], b in [-Q/2, Q/2], then c in [-3Q/2, 3Q/2]
+ * However, the output bound varies with the input bound. Using this knowledge, we can further
+ * optimize the Barrett points by carefully checking the output bound implied by each input bound.
+ *
+ * - Barrett reduction computes c = a % Q: a in [-R, R], then c in [-Q/2, Q/2]
+ *
+ * Doubling, because:
+ * - Montgomery doubling works with two unknown coefficients, with no constraint at all:
+ * => c = a*b, a, b in [-R, R], then c in [-Q, Q]
+ */
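+
+/*
+ * Scalar sketch of the two reductions (illustration only; the vector macros
+ * below are the implementation, and the constant-vector lane layout is
+ * assumed to be Q in lane 0 and q^-1 mod 2^16 in lane 1):
+ *
+ * Barrett rounding multiply by a fixed constant w (zl = w low 16 bits,
+ * zh = precomputed companion constant so that hi ~ round(t*w/Q)):
+ * hi = (2*t*zh + 2^15) >> 16; // vqrdmulhq
+ * c = (int16_t)(t*zl) - hi*Q; // congruent to t*w mod Q, in [-3Q/2, 3Q/2]
+ *
+ * Montgomery doubling multiply of two variable inputs (R = 2^16):
+ * hi = (2*a*b) >> 16; // vqdmulhq
+ * m = (int16_t)(a * (int16_t)(b * q^-1)); // low halves, vmulq
+ * c = (hi - ((2*m*Q) >> 16)) >> 1; // vhsubq; ~ (a*b - m*Q)/R, in [-Q, Q]
+ */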
+
+// ------------ Forward NTT and Inverse NTT ------------
+/*
+ * GS Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define gsbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]); \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top_x4(a, b, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
+#define gsbf_bri_bot_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top(a, b, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b);
+
+#define gsbf_bri_bot(b, zl, zh, i, QMVQ, t) \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_br_bot(b, zl, zh, QMVQ, t) \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+/*
+ * Barrett multiplication via *Rounding*, used for the inverse NTT
+ * Input: a, b, zl, zh, Q. a in [-R, R]
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define barmul_invntt(a, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(a, zh, i); \
+ a = vmulq_laneq_s16(a, zl, i); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
+
+#define barmul_invntt_x2(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0);
+
+#define barmul_invntt_x4(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], zh, i); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], zl, i); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], zl, i); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+/*
+ * Convert coefficients to Montgomery domain
+ */
+#define barmuli_mont(a, QMVM, t) \
+ t = vqrdmulhq_laneq_s16(a, QMVM, 6); \
+ a = vmulq_laneq_s16(a, QMVM, 2); \
+ a = vmlsq_laneq_s16(a, t, QMVM, 0);
+
+#define barmuli_mont_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 6); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 6); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 6); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 6); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 6); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 6); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 6); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 6); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], QMVM, 2); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], QMVM, 2); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], QMVM, 2); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], QMVM, 2); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 2); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 2); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 2); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 2); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * Convert coefficients to Montgomery domain and embed n^-1
+ */
+
+#define barmuli_mont_ninv_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 7); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 7); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 7); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 7); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 7); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 7); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 7); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 7); \
+ a.val[0] = vshlq_n_s16(a.val[0], FALCON_LOG2_NINV_MONT); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vshlq_n_s16(a.val[1], FALCON_LOG2_NINV_MONT); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vshlq_n_s16(a.val[2], FALCON_LOG2_NINV_MONT); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vshlq_n_s16(a.val[3], FALCON_LOG2_NINV_MONT); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vshlq_n_s16(b.val[0], FALCON_LOG2_NINV_MONT); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vshlq_n_s16(b.val[1], FALCON_LOG2_NINV_MONT); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vshlq_n_s16(b.val[2], FALCON_LOG2_NINV_MONT); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vshlq_n_s16(b.val[3], FALCON_LOG2_NINV_MONT); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * CT Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define ctbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_br_top(b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bri_top(b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bot(a, b, t) \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_bri_top_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0);
+
+#define ctbf_bot_x4(a, b, t) \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+#define ctbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0); \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+// ------------ Pointwise Multiplication ------------
+/*
+ * Montgomery multiplication via *Doubling*
+ * Input: a, b, bNinv, Q
+ * Output: c = ab * R^-1
+ */
+#define montmul(c, a, b, QMVM, t) \
+ c = vqdmulhq_s16(a, b); \
+ t = vmulq_laneq_s16(b, QMVM, 1); \
+ t = vmulq_s16(a, t); \
+ t = vqdmulhq_laneq_s16(t, QMVM, 0); \
+ c = vhsubq_s16(c, t);
+
+#define montmul_x4(z, a, b, QMVM, t) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]);
+
+#define montmul_x8(z, w, a, b, e, f, QMVM, t, k) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ w.val[0] = vqdmulhq_s16(e.val[0], f.val[0]); \
+ w.val[1] = vqdmulhq_s16(e.val[1], f.val[1]); \
+ w.val[2] = vqdmulhq_s16(e.val[2], f.val[2]); \
+ w.val[3] = vqdmulhq_s16(e.val[3], f.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ k.val[0] = vmulq_laneq_s16(f.val[0], QMVM, 1); \
+ k.val[1] = vmulq_laneq_s16(f.val[1], QMVM, 1); \
+ k.val[2] = vmulq_laneq_s16(f.val[2], QMVM, 1); \
+ k.val[3] = vmulq_laneq_s16(f.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ k.val[0] = vmulq_s16(e.val[0], k.val[0]); \
+ k.val[1] = vmulq_s16(e.val[1], k.val[1]); \
+ k.val[2] = vmulq_s16(e.val[2], k.val[2]); \
+ k.val[3] = vmulq_s16(e.val[3], k.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]); \
+ k.val[0] = vqdmulhq_laneq_s16(k.val[0], QMVM, 0); \
+ w.val[0] = vhsubq_s16(w.val[0], k.val[0]); \
+ k.val[1] = vqdmulhq_laneq_s16(k.val[1], QMVM, 0); \
+ w.val[1] = vhsubq_s16(w.val[1], k.val[1]); \
+ k.val[2] = vqdmulhq_laneq_s16(k.val[2], QMVM, 0); \
+ w.val[2] = vhsubq_s16(w.val[2], k.val[2]); \
+ k.val[3] = vqdmulhq_laneq_s16(k.val[3], QMVM, 0); \
+ w.val[3] = vhsubq_s16(w.val[3], k.val[3]);
+
+// ------------ Barrett Reduction ------------
+/*
+ * Barrett reduction, return [-Q/2, Q/2]
+ * `v` = 5461, `n` = 11
+ */
+#define barrett(a, QMVQ, t) \
+ t = vqdmulhq_laneq_s16(a, QMVQ, 4); \
+ t = vrshrq_n_s16(t, 11); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
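+
+/*
+ * Scalar reference for the Barrett reduction above (illustrative only):
+ * v = FALCON_V = 5461 is about 2^26 / q, so t approximates round(a / q) and
+ * a - t*q lands in [-Q/2, Q/2].
+ *
+ *   static int16_t barrett_ref(int16_t a) {
+ *       int16_t t = (int16_t)((2 * (int32_t)a * FALCON_V) >> 16); // vqdmulhq, lane 4 = v
+ *       t = (int16_t)((t + (1 << 10)) >> 11);                     // vrshrq_n, rounding shift
+ *       return (int16_t)(a - t * FALCON_Q);                       // vmlsq, lane 0 = q
+ *   }
+ */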
+
+#define barrett_x2(a, i, j, m, n, QMVQ, t) \
+ t.val[m] = vqdmulhq_laneq_s16(a.val[i], QMVQ, 4); \
+ t.val[m] = vrshrq_n_s16(t.val[m], 11); \
+ t.val[n] = vqdmulhq_laneq_s16(a.val[j], QMVQ, 4); \
+ t.val[n] = vrshrq_n_s16(t.val[n], 11); \
+ a.val[i] = vmlsq_laneq_s16(a.val[i], t.val[m], QMVQ, 0); \
+ a.val[j] = vmlsq_laneq_s16(a.val[j], t.val[n], QMVQ, 0);
+
+#define barrett_x4(a, QMVQ, t) \
+ t.val[0] = vqdmulhq_laneq_s16(a.val[0], QMVQ, 4); \
+ t.val[0] = vrshrq_n_s16(t.val[0], 11); \
+ t.val[1] = vqdmulhq_laneq_s16(a.val[1], QMVQ, 4); \
+ t.val[1] = vrshrq_n_s16(t.val[1], 11); \
+ t.val[2] = vqdmulhq_laneq_s16(a.val[2], QMVQ, 4); \
+ t.val[2] = vrshrq_n_s16(t.val[2], 11); \
+ t.val[3] = vqdmulhq_laneq_s16(a.val[3], QMVQ, 4); \
+ t.val[3] = vrshrq_n_s16(t.val[3], 11); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+// ------------ Matrix Transpose ------------
+/*
+ * Matrix 4x4 transpose: v
+ * Input: int16x8x4_t v, tmp
+ * Output: int16x8x4_t v
+ */
+#define transpose(v, tmp) \
+ tmp.val[0] = vtrn1q_s16(v.val[0], v.val[1]); \
+ tmp.val[1] = vtrn2q_s16(v.val[0], v.val[1]); \
+ tmp.val[2] = vtrn1q_s16(v.val[2], v.val[3]); \
+ tmp.val[3] = vtrn2q_s16(v.val[2], v.val[3]); \
+ v.val[0] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[2] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[1] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]); \
+ v.val[3] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]);
+
+// ------------ Re-arrange vector ------------
+#define arrange(v_out, v_in, i, j, m, n, a, b, c, d) \
+ v_out.val[a] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[b] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[c] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]); \
+ v_out.val[d] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]);
+
+// ------------ Addition/Subtraction ------------
+#define vsub_x4(c, a, b) \
+ c.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+#define vadd_x4(c, a, b) \
+ c.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt.c
new file mode 100644
index 000000000..9b8c7e92f
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt.c
@@ -0,0 +1,822 @@
+/*
+ * High-speed vectorized NTT for N = 512, 1024
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrous.h"
+#include "ntt_consts.h"
+#include "poly.h"
+
+#include <arm_neon.h>
+
+/*
+ * Assume Input in the range [-Q/2, Q/2]
+ * Total Barrett point for N = 512, 1024: 2048, 4096
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont) {
+ // Total SIMD registers 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_ntt_br = PQCLEAN_FALCONPADDED512_AARCH64_ntt_br;
+ const int16_t *ptr_ntt_qinv_br = PQCLEAN_FALCONPADDED512_AARCH64_ntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+ zl.val[0] = vld1q_s16(ptr_ntt_br);
+ zh.val[0] = vld1q_s16(ptr_ntt_qinv_br);
+ ptr_ntt_br += 8;
+ ptr_ntt_qinv_br += 8;
+
+ // Layer 8, 7
+ for (unsigned j = 0; j < 128; j += 32) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 128]);
+ vload_s16_x4(v2, &a[j + 256]);
+ vload_s16_x4(v3, &a[j + 384]);
+
+ // v0: .5
+ // v1: .5
+ // v2: .5
+ // v3: .5
+
+ // Layer 8
+ // v0 - v2, v1 - v3
+ ctbf_bri_top_x4(v2, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v2, t);
+ ctbf_bot_x4(v1, v3, t2);
+
+ // v0: 1.2
+ // v1: 1.2
+ // v2: 1.2
+ // v3: 1.2
+
+ // Layer 7
+ // v0 - v1, v2 - v3
+ ctbf_bri_top_x4(v1, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zl.val[0], zh.val[0], 3, 3, 3, 3, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v1, t);
+ ctbf_bot_x4(v2, v3, t2);
+
+ // 2.14 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Store at 0.5Q
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 128], v1);
+ vstore_s16_x4(&a[j + 256], v2);
+ vstore_s16_x4(&a[j + 384], v3);
+ }
+
+ // Layer 6, 5, 4, 3, 2, 1, 0
+ for (unsigned j = 0; j < FALCON_N; j += 128) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 32]);
+ vload_s16_x4(v2, &a[j + 64]);
+ vload_s16_x4(v3, &a[j + 96]);
+
+ vload_s16_x2(zlh, ptr_ntt_br);
+ vload_s16_x2(zhh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 16;
+ ptr_ntt_qinv_br += 16;
+
+ // Layer 6
+ // v0 - v2, v1 - v3
+ ctbf_bri_top_x4(v2, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v2, t);
+ ctbf_bot_x4(v1, v3, t2);
+
+ // 1.3
+
+ // Layer 5
+ // v0 - v1, v2 - v3
+ ctbf_bri_top_x4(v1, zlh.val[0], zhh.val[0], 1, 1, 1, 1, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v1, t);
+ ctbf_bot_x4(v2, v3, t2);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 4
+ // v0(0, 1 - 2, 3)
+ // v1(0, 1 - 2, 3)
+ // v2(0, 1 - 2, 3)
+ // v3(0, 1 - 2, 3)
+ ctbf_bri_top(v0.val[2], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[2], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[2], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[2], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t2.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t2.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t2.val[3]);
+
+ // 1.3
+
+ // Layer 3
+ // v0(0, 2 - 1, 3)
+ // v1(0, 2 - 1, 3)
+ // v2(0, 2 - 1, 3)
+ // v3(0, 2 - 1, 3)
+ ctbf_bri_top(v0.val[1], zlh.val[0], zhh.val[0], 7, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[1], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[1], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[1], zhh.val[1], 4, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[1], zlh.val[1], zhh.val[1], 5, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[1], zhh.val[1], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v0.val[2], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[2]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[1], t2.val[0]);
+ ctbf_bot(v2.val[2], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[1], t2.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t2.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 2
+ // Input:
+ // 0, 1, 2, 3 | 4, 5, 6, 7
+ // 8, 9, 10, 11 | 12, 13, 14, 15
+ // 16, 17, 18, 19 | 20, 21, 22, 23
+ // 24, 25, 26, 27 | 28, 29, 30, 31
+ arrange(t, v0, 0, 2, 1, 3, 0, 1, 2, 3);
+ v0 = t;
+ arrange(t, v1, 0, 2, 1, 3, 0, 1, 2, 3);
+ v1 = t;
+ arrange(t2, v2, 0, 2, 1, 3, 0, 1, 2, 3);
+ v2 = t2;
+ arrange(t2, v3, 0, 2, 1, 3, 0, 1, 2, 3);
+ v3 = t2;
+ // Output:
+ // 0, 1, 2, 3 | 16, 17, 18, 19
+ // 4, 5, 6, 7 | 20, 21, 22, 23
+ // 8, 9, 10, 11 | 24, 25, 26, 27
+ // 12, 13, 14, 15 | 28, 29, 30, 31
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 1.3
+
+ // Layer 1: v0.val[0] x v0.val[2] | v0.val[1] x v0.val[3]
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // transpose 4x4
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[2], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[1]);
+ ctbf_br_top(v1.val[2], zl.val[1], zh.val[1], neon_qmvq, t.val[2]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_br_top(v2.val[2], zl.val[2], zh.val[2], neon_qmvq, t.val[0]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[1]);
+ ctbf_br_top(v3.val[2], zl.val[3], zh.val[3], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 0
+ // v(0, 2 - 1, 3)
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 1.3
+ if (mont == NTT_MONT) {
+ // Convert to Montgomery domain by multiply with FALCON_MONT
+ barmuli_mont_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_x8(v2, v3, neon_qmvq, t, t2);
+ } else if (mont == NTT_MONT_INV) {
+ barmuli_mont_ninv_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_ninv_x8(v2, v3, neon_qmvq, t, t2);
+ }
+
+        vstore_s16_x4(&a[j], v0);
+        vstore_s16_x4(&a[j + 32], v1);
+        vstore_s16_x4(&a[j + 64], v2);
+        vstore_s16_x4(&a[j + 96], v3);
+ }
+}
+
+/*
+ * Assume input in range [-Q, Q]
+ * Total Barrett point N = 512, 1024: 1792, 3840
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv) {
+ // Total SIMD registers: 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_invntt_br = PQCLEAN_FALCONPADDED512_AARCH64_invntt_br;
+ const int16_t *ptr_invntt_qinv_br = PQCLEAN_FALCONPADDED512_AARCH64_invntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+ unsigned j;
+
+ // Layer 0, 1, 2, 3, 4, 5, 6
+ for (j = 0; j < FALCON_N; j += 128) {
+        vload_s16_x4(v0, &a[j]);
+        vload_s16_x4(v1, &a[j + 32]);
+        vload_s16_x4(v2, &a[j + 64]);
+        vload_s16_x4(v3, &a[j + 96]);
+
+ // Layer 0
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ // 0 - 1*, 2 - 3*
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 1
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ // 0 - 2*, 1 - 3*
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v0.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t.val[1]);
+ gsbf_br_bot(v1.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v1.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v2.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v3.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.6
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 2
+ // Before Transpose
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+
+ // After Transpose
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // 0 - 1*, 2 - 3*
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 1
+ // 1: 0.9
+ // 2: 1
+ // 3: 0.9
+
+ // Layer 3
+ // Re-arrange vector from
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // Compiler will handle register re-naming
+ arrange(t, v0, 0, 1, 2, 3, 0, 2, 1, 3);
+ v0 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t, v1, 0, 1, 2, 3, 0, 2, 1, 3);
+ v1 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v2, 0, 1, 2, 3, 0, 2, 1, 3);
+ v2 = t2;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v3, 0, 1, 2, 3, 0, 2, 1, 3);
+ v3 = t2;
+ // To
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 1, 2 - 3
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v0.val[2], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[2]);
+ gsbf_top(v1.val[2], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[1], t2.val[0]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[1], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[1], zlh.val[0], zhh.val[0], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[0], zhh.val[0], 1, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[1], zlh.val[0], zhh.val[0], 2, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[1], zlh.val[0], zhh.val[0], 4, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[1], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[0], zhh.val[0], 7, neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 4
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 2, 1 - 3
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[2], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[2], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[2], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[2], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.5
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 5
+ // Cross block
+ // v0.0->3 - v1.0->3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zlh.val[1], zhh.val[1], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 5, 5, 5, 5, neon_qmvq, t2);
+
+ // v0: 1
+ // v1: 0.9
+ // v2: 1
+ // v3: 0.9
+
+ // Layer 6
+ // Cross block
+ // v0.0->3 - v2.0->3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ gsbf_bri_bot_x4(v2, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t2);
+
+ // v0: 2
+ // v1: 1.8
+ // v2: 1.3
+ // v3: 1.2
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 32], v1);
+ vstore_s16_x4(&a[j + 64], v2);
+ vstore_s16_x4(&a[j + 96], v3);
+ }
+
+ zl.val[0] = vld1q_s16(ptr_invntt_br);
+ zh.val[0] = vld1q_s16(ptr_invntt_qinv_br);
+
+ // Layer 7, 8
+ for (j = 0; j < 64; j += 32) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 128]);
+ vload_s16_x4(v2, &a[j + 256]);
+ vload_s16_x4(v3, &a[j + 384]);
+
+ // 2
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // v0: .5
+ // v1: .5
+ // v2: .5
+ // v3: .5
+
+ // Layer 7
+ // v0 - v1, v2 - v3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zl.val[0], zh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t2);
+
+ // v0: 1
+ // v1: .87
+ // v2: 1
+ // v3: .87
+
+ // Layer 8
+ // v0 - v2, v1 - v3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ // v0: 2
+ // v1: 1.75
+ // v2: 1.25
+ // v3: 1.15
+ if (ninv == INVNTT_NINV) {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+ barmul_invntt_x4(v0, zl.val[0], zh.val[0], 3, neon_qmvq, t);
+ barmul_invntt_x4(v1, zl.val[0], zh.val[0], 3, neon_qmvq, t2);
+ } else {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t2);
+ }
+
+ // v0: 1.25
+ // v1: 1.15
+ // v2: 1.25
+ // v3: 1.15
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+
+ // v0: 0.5
+ // v1: 0.5
+ // v2: 0.97
+ // v3: 0.93
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 128], v1);
+ vstore_s16_x4(&a[j + 256], v2);
+ vstore_s16_x4(&a[j + 384], v3);
+ }
+ for (; j < 128; j += 32) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 128]);
+ vload_s16_x4(v2, &a[j + 256]);
+ vload_s16_x4(v3, &a[j + 384]);
+
+ // v0: 1.3
+ // v1: 1.3
+ // v2: 1.3
+ // v3: 1.3
+
+ // Layer 7
+ // v0 - v1, v2 - v3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zl.val[0], zh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t2);
+
+ // v0: 2.6
+ // v1: 1.5
+ // v2: 2.6
+ // v3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // v0: 0.5
+ // v1: 0.5
+ // v2: 0.5
+ // v3: 0.5
+
+ // Layer 8
+ // v0 - v2, v1 - v3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ // v0: 1
+ // v1: 1
+ // v2: .87
+ // v3: .87
+ if (ninv == INVNTT_NINV) {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+ barmul_invntt_x4(v0, zl.val[0], zh.val[0], 3, neon_qmvq, t);
+ barmul_invntt_x4(v1, zl.val[0], zh.val[0], 3, neon_qmvq, t2);
+ } else {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t2);
+ }
+
+ // v0: .87
+ // v1: .87
+ // v2: .83
+ // v3: .83
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 128], v1);
+ vstore_s16_x4(&a[j + 256], v2);
+ vstore_s16_x4(&a[j + 384], v3);
+ }
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e1, e2, t, k; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+ vload_s16_x4(b, &g[i]);
+ vload_s16_x4(c, &f[i + 32]);
+ vload_s16_x4(d, &g[i + 32]);
+
+ montmul_x8(e1, e2, a, b, c, d, neon_qmvm, t, k);
+
+ vstore_s16_x4(&f[i], e1);
+ vstore_s16_x4(&f[i + 32], e2);
+ }
+}
+
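+/*
+ * A plausible call sequence for a full negacyclic product h = f*g over
+ * Z_q[x]/(x^N + 1) using the routines above -- a hedged sketch only; which
+ * operand is lifted to the Montgomery domain is dictated by the callers of
+ * these functions and may differ from this outline:
+ *
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(f, NTT_NONE);
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(g, NTT_MONT);        // scale by R = 2^16
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(f, g);       // f <- f*g*R^-1 pointwise
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(f, INVNTT_NINV);  // undo NTT, multiply by 1/N
+ */
+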
+/* ===================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.c
new file mode 100644
index 000000000..1f0076ebd
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.c
@@ -0,0 +1,377 @@
+#include "ntt_consts.h"
+#include "params.h"
+
+#define PADDING 0
+
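+/*
+ * Lane layout consumed by the macros in macrous.h: lane 0 = q, lane 1 =
+ * q^-1 mod 2^16, lane 2 = R = 2^16 mod q, lane 3 = N^-1 * R mod q, lane 4 =
+ * the Barrett constant v, lane 5 unused, lanes 6-7 = the precomputed
+ * quotients FALCON_MONT_BR and FALCON_NINV_MONT_BR (see params.h).
+ */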
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_qmvq[8] = {FALCON_Q, FALCON_QINV,
+ FALCON_MONT, FALCON_NINV_MONT,
+ FALCON_V, 0,
+ FALCON_MONT_BR, FALCON_NINV_MONT_BR
+ };
+
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_br[] = {
+ PADDING, -1479, -5146, 4043, PADDING, PADDING, PADDING, PADDING,
+ -1305, 3542, -3504, -4821, 2639, -2625, -949, 2319,
+ -1170, -955, -790, -3201, 3014, 5086, -1326, PADDING,
+ 1260, 1260, 1260, 1260, 4632, 4632, 4632, 4632,
+ 2426, 2426, 2426, 2426, 1428, 1428, 1428, 1428,
+ 2013, 2013, 2013, 2013, 729, 729, 729, 729,
+ 2881, 2881, 2881, 2881, -5092, -5092, -5092, -5092,
+ 4388, 4388, 4388, 4388, -5755, -5755, -5755, -5755,
+ 334, 334, 334, 334, 1696, 1696, 1696, 1696,
+ -3289, -3289, -3289, -3289, 3241, 3241, 3241, 3241,
+ 3284, 3284, 3284, 3284, -2089, -2089, -2089, -2089,
+ 2401, 442, -5101, -1067, 390, 773, -3833, 3778,
+ 354, 4861, -2912, 5698, 5012, -2481, 2859, -1045,
+ 1017, -4885, 1632, -5084, 27, -3066, -3763, -1440,
+ 1537, 242, 4714, -4143, -2678, 3704, 5019, -545,
+ 49, 5915, -2500, -1583, 1512, -1815, 5369, -3202,
+ -2738, -5735, -3009, 174, -1975, 347, -3315, 1858,
+ 3030, 2361, 2908, 3434, 3963, 6142, 1954, -2882,
+ 3991, -2767, 2281, -2031, 3772, 5908, 5429, -4737,
+ 1263, 1483, -1489, -5942, 350, 5383, -2057, 4493,
+ -5868, 2655, 1693, 723, -3757, 2925, -426, 4754,
+ 4115, -1843, 218, -3529, 576, -2447, -2051, -1805,
+ -3969, 156, 5876, 5333, 418, -453, -4774, 1293,
+ 722, -2545, 3621, -563, -2975, -3006, -2744, 4846,
+ -2747, -3135, 3712, 4805, -3553, -1062, -2294, PADDING,
+ -3694, -3694, -3694, -3694, -1759, -1759, -1759, -1759,
+ 3382, 3382, 3382, 3382, -2548, -2548, -2548, -2548,
+ 3637, 3637, 3637, 3637, 145, 145, 145, 145,
+ -2731, -2731, -2731, -2731, -4890, -4890, -4890, -4890,
+ -5179, -5179, -5179, -5179, -3707, -3707, -3707, -3707,
+ -355, -355, -355, -355, -4231, -4231, -4231, -4231,
+ 3459, 3459, 3459, 3459, -5542, -5542, -5542, -5542,
+ -3932, -3932, -3932, -3932, -5911, -5911, -5911, -5911,
+ 1002, 5011, 5088, -4284, -4976, -1607, -3780, -875,
+ -2437, 3646, 6022, 2987, -2566, -2187, -6039, -2422,
+ -1065, 2143, -404, -4645, 1168, 5277, -1207, 3248,
+ 493, -4096, -5444, 2381, -4337, -435, 1378, 1912,
+ 295, 5766, -4016, -3762, 325, -1146, 5990, -3728,
+ 3329, -168, 5961, -1962, -6122, -5184, 1360, -6119,
+ -4079, 922, 1958, 1112, 4046, -3150, 4240, -6065,
+ 2459, 3656, -1566, -2948, -3123, -3054, -4433, 3834,
+ 6099, 652, 4077, -2919, -1404, -948, 1159, -4049,
+ 4298, 2692, -5106, 1594, -2555, -1200, 3956, 5297,
+ -1058, 441, 4322, 2078, 709, 1319, -3570, -835,
+ 683, -64, 5782, -2503, -1747, -5486, -5919, -5257,
+ 5736, -1646, 1212, 5728, -4591, 5023, 5828, 3091,
+ -81, -4320, -1000, -2963, -4896, -3051, 2366, PADDING,
+ -2842, -2842, -2842, -2842, 1022, 1022, 1022, 1022,
+ -2468, -2468, -2468, -2468, 5791, 5791, 5791, 5791,
+ -1673, -1673, -1673, -1673, -5331, -5331, -5331, -5331,
+ -4177, -4177, -4177, -4177, 1381, 1381, 1381, 1381,
+ 480, 480, 480, 480, 9, 9, 9, 9,
+ 339, 339, 339, 339, 544, 544, 544, 544,
+ 4278, 4278, 4278, 4278, -4989, -4989, -4989, -4989,
+ -3584, -3584, -3584, -3584, -2525, -2525, -2525, -2525,
+ 2166, 3915, -113, -4919, -160, 3149, -3, 4437,
+ 3636, 4938, 5291, 2704, -1426, -4654, 1663, -1777,
+ 3364, 1689, 4057, -3271, -2847, -4414, 2174, 4372,
+ -5042, -2305, 4053, 2645, 5195, -2780, -4895, 1484,
+ -5241, -4169, -5468, -3482, 5057, 4780, -192, 4912,
+ 677, -6055, 1323, -52, 1579, -2505, 3957, 151,
+ -58, 3532, 1956, -885, 3477, 142, -2844, -975,
+ -3029, 4782, -4213, 2302, -421, 3602, -3600, 6077,
+ -2920, -3127, 1010, 787, 4698, -3445, 1321, -2049,
+ -5874, -3336, -2766, 3174, -431, 5906, -2839, -2127,
+ -241, -1003, -5009, -6008, -5681, -1105, 3438, 4212,
+ -5594, 5886, 504, -605, -4080, 6068, 3263, -4624,
+ -4134, 3195, 5860, -3328, -5777, -4978, 1351, -1177,
+ -4255, -1635, -2768, -140, -1853, -4611, -726, PADDING,
+ -953, -953, -953, -953, 827, 827, 827, 827,
+ 2476, 2476, 2476, 2476, 2197, 2197, 2197, 2197,
+ 3949, 3949, 3949, 3949, 4452, 4452, 4452, 4452,
+ -4354, -4354, -4354, -4354, 2837, 2837, 2837, 2837,
+ -3748, -3748, -3748, -3748, 5767, 5767, 5767, 5767,
+ 118, 118, 118, 118, -5067, -5067, -5067, -5067,
+ -3296, -3296, -3296, -3296, 2396, 2396, 2396, 2396,
+ 130, 130, 130, 130, -5374, -5374, -5374, -5374,
+ -3247, -2686, -3978, -2969, -2370, 2865, 5332, 3510,
+ 1630, -2126, 5407, 3186, -1153, -2884, -2249, -4048,
+ -2399, -3400, -5191, -3136, -3000, 671, 3016, 243,
+ -5559, 420, -2178, 1544, 3985, 4905, 3531, 476,
+ -4467, -5537, 4449, -147, 6118, 1190, 3860, -4536,
+ 5079, 2169, -4324, -4075, -1278, 1973, -3514, 5925,
+ 654, 1702, -5529, 3199, 6136, -5415, 4948, 400,
+ 5339, 3710, 468, 316, -2033, 3879, -1359, 973,
+ -4789, 4749, -5456, -3789, -3818, -2683, 5445, -1050,
+ -3262, -522, 4916, 5315, -2344, -5574, -1041, -1018,
+ 3565, 1987, 5206, -56, -5862, -3643, -6137, -1728,
+ 5446, 6093, -3988, -382, -3998, 1922, -5435, -1254,
+}; // 512->712
+
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_qinv_br[] = {
+ PADDING, -3943, -13721, 10780, PADDING, PADDING, PADDING, PADDING,
+ -3479, 9444, -9343, -12854, 7036, -6999, -2530, 6183,
+ -3119, -2546, -2106, -8535, 8036, 13561, -3535, PADDING,
+ 3359, 3359, 3359, 3359, 12350, 12350, 12350, 12350,
+ 6468, 6468, 6468, 6468, 3807, 3807, 3807, 3807,
+ 5367, 5367, 5367, 5367, 1943, 1943, 1943, 1943,
+ 7682, 7682, 7682, 7682, -13577, -13577, -13577, -13577,
+ 11700, 11700, 11700, 11700, -15345, -15345, -15345, -15345,
+ 890, 890, 890, 890, 4522, 4522, 4522, 4522,
+ -8769, -8769, -8769, -8769, 8641, 8641, 8641, 8641,
+ 8756, 8756, 8756, 8756, -5570, -5570, -5570, -5570,
+ 6402, 1178, -13601, -2845, 1039, 2061, -10220, 10073,
+ 943, 12961, -7764, 15193, 13364, -6615, 7623, -2786,
+ 2711, -13025, 4351, -13556, 71, -8175, -10033, -3839,
+ 4098, 645, 12569, -11047, -7140, 9876, 13382, -1453,
+ 130, 15772, -6666, -4220, 4031, -4839, 14316, -8537,
+ -7300, -15292, -8023, 463, -5266, 925, -8839, 4954,
+ 8079, 6295, 7754, 9156, 10567, 16377, 5210, -7684,
+ 10641, -7378, 6082, -5415, 10057, 15753, 14476, -12630,
+ 3367, 3954, -3970, -15844, 933, 14353, -5484, 11980,
+ -15646, 7079, 4514, 1927, -10017, 7799, -1135, 12676,
+ 10972, -4914, 581, -9409, 1535, -6524, -5468, -4812,
+ -10583, 415, 15668, 14220, 1114, -1207, -12729, 3447,
+ 1925, -6786, 9655, -1501, -7932, -8015, -7316, 12921,
+ -7324, -8359, 9897, 12812, -9473, -2831, -6116, PADDING,
+ -9849, -9849, -9849, -9849, -4690, -4690, -4690, -4690,
+ 9017, 9017, 9017, 9017, -6794, -6794, -6794, -6794,
+ 9697, 9697, 9697, 9697, 386, 386, 386, 386,
+ -7282, -7282, -7282, -7282, -13038, -13038, -13038, -13038,
+ -13809, -13809, -13809, -13809, -9884, -9884, -9884, -9884,
+ -946, -946, -946, -946, -11281, -11281, -11281, -11281,
+ 9223, 9223, 9223, 9223, -14777, -14777, -14777, -14777,
+ -10484, -10484, -10484, -10484, -15761, -15761, -15761, -15761,
+ 2671, 13361, 13566, -11423, -13268, -4284, -10079, -2333,
+ -6498, 9721, 16057, 7964, -6842, -5831, -16102, -6458,
+ -2839, 5714, -1077, -12385, 3114, 14070, -3218, 8660,
+ 1314, -10921, -14516, 6348, -11564, -1159, 3674, 5098,
+ 786, 15374, -10708, -10031, 866, -3055, 15972, -9940,
+ 8876, -447, 15894, -5231, -16324, -13822, 3626, -16316,
+ -10876, 2458, 5220, 2965, 10788, -8399, 11305, -16172,
+ 6556, 9748, -4175, -7860, -8327, -8143, -11820, 10223,
+ 16262, 1738, 10871, -7783, -3743, -2527, 3090, -10796,
+ 11460, 7178, -13614, 4250, -6812, -3199, 10548, 14124,
+ -2821, 1175, 11524, 5540, 1890, 3517, -9519, -2226,
+ 1821, -170, 15417, -6674, -4658, -14628, -15782, -14017,
+ 15294, -4388, 3231, 15273, -12241, 13393, 15540, 8241,
+ -215, -11519, -2666, -7900, -13054, -8135, 6308, PADDING,
+ -7578, -7578, -7578, -7578, 2725, 2725, 2725, 2725,
+ -6580, -6580, -6580, -6580, 15441, 15441, 15441, 15441,
+ -4460, -4460, -4460, -4460, -14214, -14214, -14214, -14214,
+ -11137, -11137, -11137, -11137, 3682, 3682, 3682, 3682,
+ 1279, 1279, 1279, 1279, 23, 23, 23, 23,
+ 903, 903, 903, 903, 1450, 1450, 1450, 1450,
+ 11407, 11407, 11407, 11407, -13302, -13302, -13302, -13302,
+ -9556, -9556, -9556, -9556, -6732, -6732, -6732, -6732,
+ 5775, 10439, -301, -13116, -426, 8396, -7, 11831,
+ 9695, 13166, 14108, 7210, -3802, -12409, 4434, -4738,
+ 8969, 4503, 10817, -8721, -7591, -11769, 5796, 11657,
+ -13444, -6146, 10807, 7052, 13852, -7412, -13052, 3957,
+ -13974, -11116, -14580, -9284, 13484, 12745, -511, 13097,
+ 1805, -16145, 3527, -138, 4210, -6679, 10551, 402,
+ -154, 9417, 5215, -2359, 9271, 378, -7583, -2599,
+ -8076, 12750, -11233, 6138, -1122, 9604, -9599, 16204,
+ -7786, -8337, 2693, 2098, 12526, -9185, 3522, -5463,
+ -15662, -8895, -7375, 8463, -1149, 15748, -7570, -5671,
+ -642, -2674, -13356, -16020, -15148, -2946, 9167, 11231,
+ -14916, 15694, 1343, -1613, -10879, 16180, 8700, -12329,
+ -11023, 8519, 15625, -8873, -15404, -13273, 3602, -3138,
+ -11345, -4359, -7380, -373, -4940, -12294, -1935, PADDING,
+ -2541, -2541, -2541, -2541, 2205, 2205, 2205, 2205,
+ 6602, 6602, 6602, 6602, 5858, 5858, 5858, 5858,
+ 10529, 10529, 10529, 10529, 11871, 11871, 11871, 11871,
+ -11609, -11609, -11609, -11609, 7564, 7564, 7564, 7564,
+ -9993, -9993, -9993, -9993, 15377, 15377, 15377, 15377,
+ 314, 314, 314, 314, -13510, -13510, -13510, -13510,
+ -8788, -8788, -8788, -8788, 6388, 6388, 6388, 6388,
+ 346, 346, 346, 346, -14329, -14329, -14329, -14329,
+ -8657, -7162, -10607, -7916, -6319, 7639, 14217, 9359,
+ 4346, -5668, 14417, 8495, -3074, -7690, -5996, -10793,
+ -6396, -9065, -13841, -8361, -7999, 1789, 8042, 647,
+ -14822, 1119, -5807, 4116, 10625, 13078, 9415, 1269,
+ -11911, -14764, 11863, -391, 16313, 3173, 10292, -12095,
+ 13542, 5783, -11529, -10865, -3407, 5260, -9369, 15798,
+ 1743, 4538, -14742, 8529, 16361, -14438, 13193, 1066,
+ 14236, 9892, 1247, 842, -5420, 10343, -3623, 2594,
+ -12769, 12662, -14548, -10103, -10180, -7154, 14518, -2799,
+ -8697, -1391, 13108, 14172, -6250, -14862, -2775, -2714,
+ 9505, 5298, 13881, -149, -15630, -9713, -16364, -4607,
+ 14521, 16246, -10633, -1018, -10660, 5124, -14492, -3343,
+}; // 712
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_br[] = {
+ 1254, 5435, -1922, 3998, 382, 3988, -6093, -5446,
+ 1728, 6137, 3643, 5862, 56, -5206, -1987, -3565,
+ 1018, 1041, 5574, 2344, -5315, -4916, 522, 3262,
+ 1050, -5445, 2683, 3818, 3789, 5456, -4749, 4789,
+ -973, 1359, -3879, 2033, -316, -468, -3710, -5339,
+ -400, -4948, 5415, -6136, -3199, 5529, -1702, -654,
+ -5925, 3514, -1973, 1278, 4075, 4324, -2169, -5079,
+ 4536, -3860, -1190, -6118, 147, -4449, 5537, 4467,
+ -476, -3531, -4905, -3985, -1544, 2178, -420, 5559,
+ -243, -3016, -671, 3000, 3136, 5191, 3400, 2399,
+ 4048, 2249, 2884, 1153, -3186, -5407, 2126, -1630,
+ -3510, -5332, -2865, 2370, 2969, 3978, 2686, 3247,
+ 5374, 5374, 5374, 5374, -130, -130, -130, -130,
+ -2396, -2396, -2396, -2396, 3296, 3296, 3296, 3296,
+ 5067, 5067, 5067, 5067, -118, -118, -118, -118,
+ -5767, -5767, -5767, -5767, 3748, 3748, 3748, 3748,
+ -2837, -2837, -2837, -2837, 4354, 4354, 4354, 4354,
+ -4452, -4452, -4452, -4452, -3949, -3949, -3949, -3949,
+ -2197, -2197, -2197, -2197, -2476, -2476, -2476, -2476,
+ -827, -827, -827, -827, 953, 953, 953, 953,
+ 726, 4611, 1853, 140, 2768, 1635, 4255, 1177,
+ -1351, 4978, 5777, 3328, -5860, -3195, 4134, PADDING,
+ 4624, -3263, -6068, 4080, 605, -504, -5886, 5594,
+ -4212, -3438, 1105, 5681, 6008, 5009, 1003, 241,
+ 2127, 2839, -5906, 431, -3174, 2766, 3336, 5874,
+ 2049, -1321, 3445, -4698, -787, -1010, 3127, 2920,
+ -6077, 3600, -3602, 421, -2302, 4213, -4782, 3029,
+ 975, 2844, -142, -3477, 885, -1956, -3532, 58,
+ -151, -3957, 2505, -1579, 52, -1323, 6055, -677,
+ -4912, 192, -4780, -5057, 3482, 5468, 4169, 5241,
+ -1484, 4895, 2780, -5195, -2645, -4053, 2305, 5042,
+ -4372, -2174, 4414, 2847, 3271, -4057, -1689, -3364,
+ 1777, -1663, 4654, 1426, -2704, -5291, -4938, -3636,
+ -4437, 3, -3149, 160, 4919, 113, -3915, -2166,
+ 2525, 2525, 2525, 2525, 3584, 3584, 3584, 3584,
+ 4989, 4989, 4989, 4989, -4278, -4278, -4278, -4278,
+ -544, -544, -544, -544, -339, -339, -339, -339,
+ -9, -9, -9, -9, -480, -480, -480, -480,
+ -1381, -1381, -1381, -1381, 4177, 4177, 4177, 4177,
+ 5331, 5331, 5331, 5331, 1673, 1673, 1673, 1673,
+ -5791, -5791, -5791, -5791, 2468, 2468, 2468, 2468,
+ -1022, -1022, -1022, -1022, 2842, 2842, 2842, 2842,
+ -2366, 3051, 4896, 2963, 1000, 4320, 81, -3091,
+ -5828, -5023, 4591, -5728, -1212, 1646, -5736, PADDING,
+ 5257, 5919, 5486, 1747, 2503, -5782, 64, -683,
+ 835, 3570, -1319, -709, -2078, -4322, -441, 1058,
+ -5297, -3956, 1200, 2555, -1594, 5106, -2692, -4298,
+ 4049, -1159, 948, 1404, 2919, -4077, -652, -6099,
+ -3834, 4433, 3054, 3123, 2948, 1566, -3656, -2459,
+ 6065, -4240, 3150, -4046, -1112, -1958, -922, 4079,
+ 6119, -1360, 5184, 6122, 1962, -5961, 168, -3329,
+ 3728, -5990, 1146, -325, 3762, 4016, -5766, -295,
+ -1912, -1378, 435, 4337, -2381, 5444, 4096, -493,
+ -3248, 1207, -5277, -1168, 4645, 404, -2143, 1065,
+ 2422, 6039, 2187, 2566, -2987, -6022, -3646, 2437,
+ 875, 3780, 1607, 4976, 4284, -5088, -5011, -1002,
+ 5911, 5911, 5911, 5911, 3932, 3932, 3932, 3932,
+ 5542, 5542, 5542, 5542, -3459, -3459, -3459, -3459,
+ 4231, 4231, 4231, 4231, 355, 355, 355, 355,
+ 3707, 3707, 3707, 3707, 5179, 5179, 5179, 5179,
+ 4890, 4890, 4890, 4890, 2731, 2731, 2731, 2731,
+ -145, -145, -145, -145, -3637, -3637, -3637, -3637,
+ 2548, 2548, 2548, 2548, -3382, -3382, -3382, -3382,
+ 1759, 1759, 1759, 1759, 3694, 3694, 3694, 3694,
+ 2294, 1062, 3553, -4805, -3712, 3135, 2747, -4846,
+ 2744, 3006, 2975, 563, -3621, 2545, -722, PADDING,
+ -1293, 4774, 453, -418, -5333, -5876, -156, 3969,
+ 1805, 2051, 2447, -576, 3529, -218, 1843, -4115,
+ -4754, 426, -2925, 3757, -723, -1693, -2655, 5868,
+ -4493, 2057, -5383, -350, 5942, 1489, -1483, -1263,
+ 4737, -5429, -5908, -3772, 2031, -2281, 2767, -3991,
+ 2882, -1954, -6142, -3963, -3434, -2908, -2361, -3030,
+ -1858, 3315, -347, 1975, -174, 3009, 5735, 2738,
+ 3202, -5369, 1815, -1512, 1583, 2500, -5915, -49,
+ 545, -5019, -3704, 2678, 4143, -4714, -242, -1537,
+ 1440, 3763, 3066, -27, 5084, -1632, 4885, -1017,
+ 1045, -2859, 2481, -5012, -5698, 2912, -4861, -354,
+ -3778, 3833, -773, -390, 1067, 5101, -442, -2401,
+ 2089, 2089, 2089, 2089, -3284, -3284, -3284, -3284,
+ -3241, -3241, -3241, -3241, 3289, 3289, 3289, 3289,
+ -1696, -1696, -1696, -1696, -334, -334, -334, -334,
+ 5755, 5755, 5755, 5755, -4388, -4388, -4388, -4388,
+ 5092, 5092, 5092, 5092, -2881, -2881, -2881, -2881,
+ -729, -729, -729, -729, -2013, -2013, -2013, -2013,
+ -1428, -1428, -1428, -1428, -2426, -2426, -2426, -2426,
+ -4632, -4632, -4632, -4632, -1260, -1260, -1260, -1260,
+ 1326, -5086, -3014, 3201, 790, 955, 1170, -2319,
+ 949, 2625, -2639, 4821, 3504, -3542, 1305, PADDING,
+ -4043, 5146, 1371, 12265, 1479, PADDING, PADDING, PADDING,
+}; // 712
+
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_qinv_br[] = {
+ 3343, 14492, -5124, 10660, 1018, 10633, -16246, -14521,
+ 4607, 16364, 9713, 15630, 149, -13881, -5298, -9505,
+ 2714, 2775, 14862, 6250, -14172, -13108, 1391, 8697,
+ 2799, -14518, 7154, 10180, 10103, 14548, -12662, 12769,
+ -2594, 3623, -10343, 5420, -842, -1247, -9892, -14236,
+ -1066, -13193, 14438, -16361, -8529, 14742, -4538, -1743,
+ -15798, 9369, -5260, 3407, 10865, 11529, -5783, -13542,
+ 12095, -10292, -3173, -16313, 391, -11863, 14764, 11911,
+ -1269, -9415, -13078, -10625, -4116, 5807, -1119, 14822,
+ -647, -8042, -1789, 7999, 8361, 13841, 9065, 6396,
+ 10793, 5996, 7690, 3074, -8495, -14417, 5668, -4346,
+ -9359, -14217, -7639, 6319, 7916, 10607, 7162, 8657,
+ 14329, 14329, 14329, 14329, -346, -346, -346, -346,
+ -6388, -6388, -6388, -6388, 8788, 8788, 8788, 8788,
+ 13510, 13510, 13510, 13510, -314, -314, -314, -314,
+ -15377, -15377, -15377, -15377, 9993, 9993, 9993, 9993,
+ -7564, -7564, -7564, -7564, 11609, 11609, 11609, 11609,
+ -11871, -11871, -11871, -11871, -10529, -10529, -10529, -10529,
+ -5858, -5858, -5858, -5858, -6602, -6602, -6602, -6602,
+ -2205, -2205, -2205, -2205, 2541, 2541, 2541, 2541,
+ 1935, 12294, 4940, 373, 7380, 4359, 11345, 3138,
+ -3602, 13273, 15404, 8873, -15625, -8519, 11023, PADDING,
+ 12329, -8700, -16180, 10879, 1613, -1343, -15694, 14916,
+ -11231, -9167, 2946, 15148, 16020, 13356, 2674, 642,
+ 5671, 7570, -15748, 1149, -8463, 7375, 8895, 15662,
+ 5463, -3522, 9185, -12526, -2098, -2693, 8337, 7786,
+ -16204, 9599, -9604, 1122, -6138, 11233, -12750, 8076,
+ 2599, 7583, -378, -9271, 2359, -5215, -9417, 154,
+ -402, -10551, 6679, -4210, 138, -3527, 16145, -1805,
+ -13097, 511, -12745, -13484, 9284, 14580, 11116, 13974,
+ -3957, 13052, 7412, -13852, -7052, -10807, 6146, 13444,
+ -11657, -5796, 11769, 7591, 8721, -10817, -4503, -8969,
+ 4738, -4434, 12409, 3802, -7210, -14108, -13166, -9695,
+ -11831, 7, -8396, 426, 13116, 301, -10439, -5775,
+ 6732, 6732, 6732, 6732, 9556, 9556, 9556, 9556,
+ 13302, 13302, 13302, 13302, -11407, -11407, -11407, -11407,
+ -1450, -1450, -1450, -1450, -903, -903, -903, -903,
+ -23, -23, -23, -23, -1279, -1279, -1279, -1279,
+ -3682, -3682, -3682, -3682, 11137, 11137, 11137, 11137,
+ 14214, 14214, 14214, 14214, 4460, 4460, 4460, 4460,
+ -15441, -15441, -15441, -15441, 6580, 6580, 6580, 6580,
+ -2725, -2725, -2725, -2725, 7578, 7578, 7578, 7578,
+ -6308, 8135, 13054, 7900, 2666, 11519, 215, -8241,
+ -15540, -13393, 12241, -15273, -3231, 4388, -15294, PADDING,
+ 14017, 15782, 14628, 4658, 6674, -15417, 170, -1821,
+ 2226, 9519, -3517, -1890, -5540, -11524, -1175, 2821,
+ -14124, -10548, 3199, 6812, -4250, 13614, -7178, -11460,
+ 10796, -3090, 2527, 3743, 7783, -10871, -1738, -16262,
+ -10223, 11820, 8143, 8327, 7860, 4175, -9748, -6556,
+ 16172, -11305, 8399, -10788, -2965, -5220, -2458, 10876,
+ 16316, -3626, 13822, 16324, 5231, -15894, 447, -8876,
+ 9940, -15972, 3055, -866, 10031, 10708, -15374, -786,
+ -5098, -3674, 1159, 11564, -6348, 14516, 10921, -1314,
+ -8660, 3218, -14070, -3114, 12385, 1077, -5714, 2839,
+ 6458, 16102, 5831, 6842, -7964, -16057, -9721, 6498,
+ 2333, 10079, 4284, 13268, 11423, -13566, -13361, -2671,
+ 15761, 15761, 15761, 15761, 10484, 10484, 10484, 10484,
+ 14777, 14777, 14777, 14777, -9223, -9223, -9223, -9223,
+ 11281, 11281, 11281, 11281, 946, 946, 946, 946,
+ 9884, 9884, 9884, 9884, 13809, 13809, 13809, 13809,
+ 13038, 13038, 13038, 13038, 7282, 7282, 7282, 7282,
+ -386, -386, -386, -386, -9697, -9697, -9697, -9697,
+ 6794, 6794, 6794, 6794, -9017, -9017, -9017, -9017,
+ 4690, 4690, 4690, 4690, 9849, 9849, 9849, 9849,
+ 6116, 2831, 9473, -12812, -9897, 8359, 7324, -12921,
+ 7316, 8015, 7932, 1501, -9655, 6786, -1925, PADDING,
+ -3447, 12729, 1207, -1114, -14220, -15668, -415, 10583,
+ 4812, 5468, 6524, -1535, 9409, -581, 4914, -10972,
+ -12676, 1135, -7799, 10017, -1927, -4514, -7079, 15646,
+ -11980, 5484, -14353, -933, 15844, 3970, -3954, -3367,
+ 12630, -14476, -15753, -10057, 5415, -6082, 7378, -10641,
+ 7684, -5210, -16377, -10567, -9156, -7754, -6295, -8079,
+ -4954, 8839, -925, 5266, -463, 8023, 15292, 7300,
+ 8537, -14316, 4839, -4031, 4220, 6666, -15772, -130,
+ 1453, -13382, -9876, 7140, 11047, -12569, -645, -4098,
+ 3839, 10033, 8175, -71, 13556, -4351, 13025, -2711,
+ 2786, -7623, 6615, -13364, -15193, 7764, -12961, -943,
+ -10073, 10220, -2061, -1039, 2845, 13601, -1178, -6402,
+ 5570, 5570, 5570, 5570, -8756, -8756, -8756, -8756,
+ -8641, -8641, -8641, -8641, 8769, 8769, 8769, 8769,
+ -4522, -4522, -4522, -4522, -890, -890, -890, -890,
+ 15345, 15345, 15345, 15345, -11700, -11700, -11700, -11700,
+ 13577, 13577, 13577, 13577, -7682, -7682, -7682, -7682,
+ -1943, -1943, -1943, -1943, -5367, -5367, -5367, -5367,
+ -3807, -3807, -3807, -3807, -6468, -6468, -6468, -6468,
+ -12350, -12350, -12350, -12350, -3359, -3359, -3359, -3359,
+ 3535, -13561, -8036, 8535, 2106, 2546, 3119, -6183,
+ 2530, 6999, -7036, 12854, 9343, -9444, 3479, PADDING,
+ -10780, 13721, 3655, 32704, 3943, PADDING, PADDING, PADDING,
+}; // 712
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.h
new file mode 100644
index 000000000..ded719645
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.h
@@ -0,0 +1,23 @@
+#ifndef NTT_CONSTS
+#define NTT_CONSTS
+
+#include <stdint.h>
+
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_qmvq[8];
+
+/*
+ * Table for NTT, binary case:
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ */
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_qinv_br[];
+
+/*
+ * Table for inverse NTT
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
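+
+/*
+ * Quick worked check (not from the upstream comment): 7 * 8778 = 61446 =
+ * 5 * 12289 + 1, so 8778 is indeed g^-1 mod q.
+ */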
+
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_qinv_br[];
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/params.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/params.h
new file mode 100644
index 000000000..b02384ae9
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/params.h
@@ -0,0 +1,17 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define FALCON_LOGN 9
+
+#define FALCON_N (1 << FALCON_LOGN)
+#define FALCON_Q 12289
+#define FALCON_QINV (-12287) // pow(12289, -1, pow(2, 16)) - pow(2, 16)
+#define FALCON_V 5461 // Barrett reduction
+#define FALCON_MONT 4091 // pow(2, 16, 12289)
+#define FALCON_MONT_BR 10908 // (4091 << 16)//q//2
+
+#define FALCON_NINV_MONT 128 // pow(512, -1, 12289) * pow(2, 16, 12289)
+#define FALCON_NINV_MONT_BR 341 // (128 << 16) //q // 2
+#define FALCON_LOG2_NINV_MONT 7
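+
+/*
+ * Worked check of the Montgomery constants above (illustrative):
+ * 2^16 = 65536 = 5 * 12289 + 4091, hence FALCON_MONT = 4091; and
+ * 128 * 512 = 65536, i.e. 4091 mod q, hence FALCON_NINV_MONT = 128 is
+ * N^-1 * 2^16 mod q for N = 512.
+ */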
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly.h
new file mode 100644
index 000000000..73836b3f8
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly.h
@@ -0,0 +1,42 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "inner.h"
+#include "params.h"
+
+typedef enum ntt_domain {
+ NTT_NONE = 0,
+ NTT_MONT = 1,
+ NTT_MONT_INV = 2,
+} ntt_domain_t;
+
+typedef enum invntt_domain {
+ INVNTT_NONE = 0,
+ INVNTT_NINV = 1,
+} invntt_domain_t;
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]);
+
+uint16_t PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_float.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_float.c
new file mode 100644
index 000000000..b3eb7598d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_float.c
@@ -0,0 +1,1459 @@
+/*
+ * Poly FFT
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ // n = 2;
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfadd(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ // n = 4
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfadd(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfadd(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfaddx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a - b
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfsub(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfsub(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfsub(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfsubx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = -a
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+
+ vfneg(neon_c.val[0], neon_a.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ c[1] = fpr_neg(a[1]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2
+ vload(neon_a.val[0], &a[2]);
+ vfneg(neon_c.val[0], neon_a.val[0]);
+ vstore(&c[2], neon_c.val[0]);
+ break;
+
+ case 3:
+ // n = 8; hn = 4
+ vloadx2(neon_a2, &a[4]);
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+ vstorex2(&c[4], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = hn; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log1(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re;
+ c[1] = c_im;
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log2(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_c;
+ float64x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+
+ FPC_MUL(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re;
+ neon_c.val[1] = c_im;
+
+ vstorex2(&c[0], neon_c);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log3(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ FPC_MULx2(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re.val[0];
+ neon_c.val[1] = c_re.val[1];
+ neon_c.val[2] = c_im.val[0];
+ neon_c.val[3] = c_im.val[1];
+
+ vstorex4(&c[0], neon_c);
+}
+
+/* see inner.h */
+/*
+ * c = a * b
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im; // 24
+ float64x2x4_t c_re, c_im; // 8
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log1(c, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log2(c, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log3(c, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MULx4(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
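+
+/*
+ * Illustrative aside (not part of the upstream implementation): coefficients
+ * are stored split-complex, real parts in x[0..hn-1] and imaginary parts in
+ * x[hn..n-1], and FPC_MULx4 above computes the usual complex product eight
+ * lanes at a time. A minimal scalar sketch of the same pointwise product,
+ * kept under "#if 0" so it is never compiled:
+ */
+#if 0
+static void poly_mul_fft_ref(fpr *c, const fpr *a, const fpr *b, unsigned logn) {
+ const unsigned hn = 1u << (logn - 1);
+ for (unsigned i = 0; i < hn; i++) {
+ fpr a_re = a[i], a_im = a[i + hn];
+ fpr b_re = b[i], b_im = b[i + hn];
+ c[i] = a_re * b_re - a_im * b_im;
+ c[i + hn] = a_re * b_im + a_im * b_re;
+ }
+}
+#endif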
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log1(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+ d_re = d[0];
+ d_im = d[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re + d_re;
+ c[1] = c_im + d_im;
+
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log2(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_d;
+ float64x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+ vloadx2(neon_d, &d[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+ d_re = neon_d.val[0];
+ d_im = neon_d.val[1];
+
+ FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re;
+ neon_d.val[1] = d_im;
+
+ vstorex2(&c[0], neon_d);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log3(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_d;
+ float64x2x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+ vloadx4(neon_d, &d[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ d_re.val[0] = neon_d.val[0];
+ d_re.val[1] = neon_d.val[1];
+ d_im.val[0] = neon_d.val[2];
+ d_im.val[1] = neon_d.val[3];
+
+ FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re.val[0];
+ neon_d.val[1] = d_re.val[1];
+ neon_d.val[2] = d_im.val[0];
+ neon_d.val[3] = d_im.val[1];
+
+ vstorex4(&c[0], neon_d);
+}
+
+/* see inner.h */
+/*
+ * c = d + a * b
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(fpr *c, const fpr *restrict d,
+ const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im, d_re, d_im; // 32
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log1(c, d, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log2(c, d, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log3(c, d, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(fpr *d, fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&d[i], d_re);
+ vstorex4(&d[i + hn], d_im);
+ }
+}
+
+// c = d + a*b
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(fpr *c,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, c_re, c_im; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ vfdupx4(c_im, 0);
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
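+
+/*
+ * Illustrative aside (not part of the upstream implementation): per
+ * coefficient the loop above computes a * adj(a) = (a_re + i*a_im) *
+ * (a_re - i*a_im) = a_re^2 + a_im^2, which is purely real; this is why the
+ * imaginary half of c is simply filled with zeros via vfdupx4(c_im, 0).
+ */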
+
+/*
+ * c = d + a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(fpr *c,
+ const fpr *restrict d,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, d_re; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(d_re, &d[i]);
+
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(d_re.val[0], d_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(d_re.val[1], d_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(d_re.val[2], d_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(d_re.val[3], d_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], d_re);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * scalar_x
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x,
+ unsigned logn) {
+ // assert(logn >= 3);
+ // Total SIMD registers: 9
+ const unsigned falcon_n = 1 << logn;
+ float64x2x4_t neon_a, neon_c; // 8
+ float64x2_t neon_x; // 1
+ neon_x = vdupq_n_f64(x);
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfmulx4_i(neon_c, neon_a, neon_x);
+
+ vstorex4(&c[i], neon_c);
+ }
+}
+
+/* see inner.h
+ * Unused in the implementation
+ */
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_fft(fpr *restrict c,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re, c_im, m;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmulx4(m, b_re, b_re);
+ vfmlax4(m, m, b_im, b_im);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmlax4(c_re, c_re, a_im, b_im);
+
+ vfinvx4(m, m);
+
+ vfmulx4(c_im, a_im, b_re);
+ vfmlsx4(c_im, c_im, a_re, b_im);
+
+ vfmulx4(c_re, c_re, m);
+ vfmulx4(c_im, c_im, m);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re;
+ float64x2x2_t x, y;
+ float64x2_t z;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ /*
+ * x_re = a[0];
+ * x_im = a[1];
+ * y_re = b[0];
+ * y_im = b[1];
+ * d[0] = 1.0/( (x_re*x_re) + (x_im*x_im) + (y_re*y_re) + (y_im*y_im) );
+ */
+ vload(a_re.val[0], &a[0]);
+ vload(b_re.val[0], &b[0]);
+ vfmul(a_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], a_re.val[0], b_re.val[0], b_re.val[0]);
+ d[0] = 1.0 / vaddvq_f64(c_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vloadx2(x, &a[0]);
+ vloadx2(y, &b[0]);
+
+ vfmul(z, x.val[0], x.val[0]);
+ vfmla(z, z, x.val[1], x.val[1]);
+ vfmla(z, z, y.val[0], y.val[0]);
+ vfmla(z, z, y.val[1], y.val[1]);
+ vfinv(z, z);
+
+ vstore(&d[0], z);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vloadx4(a_re, &a[0]);
+ vloadx4(b_re, &b[0]);
+
+ vfmul(x.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(x.val[0], x.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(x.val[0], x.val[0], a_re.val[2], a_re.val[2]);
+ vfmla(x.val[0], x.val[0], b_re.val[2], b_re.val[2]);
+ vfinv(x.val[0], x.val[0]);
+
+ vfmul(x.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(x.val[1], x.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(x.val[1], x.val[1], a_re.val[3], a_re.val[3]);
+ vfmla(x.val[1], x.val[1], b_re.val[3], b_re.val[3]);
+ vfinv(x.val[1], x.val[1]);
+
+ vstorex2(&d[0], x);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_im.val[0], b_im.val[0]);
+ vfinv(c_re.val[0], c_re.val[0]);
+
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_im.val[1], b_im.val[1]);
+ vfinv(c_re.val[1], c_re.val[1]);
+
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_re.val[2], b_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_im.val[2], b_im.val[2]);
+ vfinv(c_re.val[2], c_re.val[2]);
+
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_re.val[3], b_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_im.val[3], b_im.val[3]);
+ vfinv(c_re.val[3], c_re.val[3]);
+
+ vstorex4(&d[i], c_re);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add_muladj_fft(
+ fpr *restrict d, const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t F_re, F_im, G_re, G_im;
+ float64x2x4_t f_re, f_im, g_re, g_im;
+ float64x2x4_t a_re, a_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(F_re, &F[i]);
+ vloadx4(F_im, &F[i + hn]);
+ vloadx4(f_re, &f[i]);
+ vloadx4(f_im, &f[i + hn]);
+
+ FPC_MUL_CONJx4(a_re, a_im, F_re, F_im, f_re, f_im);
+
+ vloadx4(G_re, &G[i]);
+ vloadx4(g_re, &g[i]);
+
+ vloadx4(G_im, &G[i + hn]);
+ vloadx4(g_im, &g[i + hn]);
+
+ FPC_MLA_CONJx4(a_re, a_im, G_re, G_im, g_re, g_im);
+
+ vstorex4(&d[i], a_re);
+ vstorex4(&d[i + hn], a_im);
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, c_re, c_im;
+ float64x2x2_t a_re_im, b_re_im, c_re_im;
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ vload(a_re.val[0], &a[0]);
+ vfmuln(a_re.val[0], a_re.val[0], b[0]);
+ vstore(&c[0], a_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vload2(a_re_im, &a[0]);
+ vload(b_re_im.val[0], &b[0]);
+ vfmul_lane(c_re_im.val[0], a_re_im.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re_im.val[1], a_re_im.val[1], b_re_im.val[0], 1);
+ vstore2(&c[0], c_re_im);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vload4(a_re, &a[0]);
+ vloadx2(b_re_im, &b[0]);
+ vfmul_lane(c_re.val[0], a_re.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re.val[1], a_re.val[1], b_re_im.val[0], 1);
+ vfmul_lane(c_re.val[2], a_re.val[2], b_re_im.val[1], 0);
+ vfmul_lane(c_re.val[3], a_re.val[3], b_re_im.val[1], 1);
+ vstore4(&c[0], c_re);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmulx4(c_im, a_im, b_re);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, binv, c_re, c_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(b_re, &b[i]);
+ vfinvx4(binv, b_re);
+
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmulx4(c_re, a_re, binv);
+ vfmulx4(c_im, a_im, binv);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log1(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&g11[0], g11_re.val[0]);
+ vstore(&g01[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log2(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&g11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&g01[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log3(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&g11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&g01[0], mu_re);
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01,
+ fpr *restrict g11, unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log1(g00, g01, g11);
+
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log2(g00, g01, g11);
+
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log3(g00, g01, g11);
+
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&g01[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&g11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&g11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&g01[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log1(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&d11[0], g11_re.val[0]);
+ vstore(&l10[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log2(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&d11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&l10[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log3(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&d11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&l10[0], mu_re);
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log1(d11, l10, g00, g01, g11);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log2(d11, l10, g00, g01, g11);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log3(d11, l10, g00, g01, g11);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&l10[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&d11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&d11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&l10[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm,
+ const unsigned falcon_n) {
+ float64x2x4_t neon_t0;
+ uint16x8x4_t neon_hm;
+ uint16x8_t neon_zero;
+ uint32x4x4_t neon_hmu32[2];
+ int64x2x4_t neon_hms64[4];
+ neon_zero = vdupq_n_u16(0);
+ for (unsigned u = 0; u < falcon_n; u += 32) {
+ neon_hm = vld1q_u16_x4(&hm[u]);
+ neon_hmu32[0].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[1], neon_zero);
+ neon_hmu32[0].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[1], neon_zero);
+
+ neon_hmu32[1].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[3], neon_zero);
+ neon_hmu32[1].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[3], neon_zero);
+
+ neon_hms64[0].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[1].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+
+ neon_hms64[2].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[3].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+
+ vfcvtx4(neon_t0, neon_hms64[0]);
+ vstorex4(&t0[u], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[1]);
+ vstorex4(&t0[u + 8], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[2]);
+ vstorex4(&t0[u + 16], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[3]);
+ vstorex4(&t0[u + 24], neon_t0);
+ }
+}
+
+fpr PQCLEAN_FALCONPADDED512_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2) {
+ float64x2x4_t r1, r11, r2, r22;
+ float64x2x4_t bnorm, bnorm2;
+
+ vfdupx4(bnorm, 0);
+ vfdupx4(bnorm2, 0);
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r1, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r1.val[0], r1.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r1.val[1], r1.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r1.val[2], r1.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r1.val[3], r1.val[3]);
+
+ vloadx4(r11, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r11.val[0], r11.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r11.val[1], r11.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r11.val[2], r11.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r11.val[3], r11.val[3]);
+ }
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r2, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r2.val[0], r2.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r2.val[1], r2.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r2.val[2], r2.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r2.val[3], r2.val[3]);
+
+ vloadx4(r22, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r22.val[0], r22.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r22.val[1], r22.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r22.val[2], r22.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r22.val[3], r22.val[3]);
+ }
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[1]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[1]);
+ vfadd(bnorm.val[2], bnorm.val[2], bnorm.val[3]);
+ vfadd(bnorm2.val[2], bnorm2.val[2], bnorm2.val[3]);
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[2]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[2]);
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm2.val[0]);
+
+ return vaddvq_f64(bnorm.val[0]);
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_int.c
new file mode 100644
index 000000000..3e1120687
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_int.c
@@ -0,0 +1,501 @@
+/*
+ * poly_int.c
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include <arm_neon.h>
+#include "macrous.h"
+#include "params.h"
+#include "poly.h"
+#include "ntt_consts.h"
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]) {
+ // Total SIMD registers: 24 = 16 + 8
+ int16x8x4_t a, b, e, f; // 16
+ int8x16x4_t c, d; // 8
+
+ for (int i = 0; i < FALCON_N; i += 128) {
+ c = vld1q_s8_x4(&in[i]);
+
+ a.val[0] = vmovl_s8(vget_low_s8(c.val[0]));
+ a.val[2] = vmovl_s8(vget_low_s8(c.val[1]));
+ b.val[0] = vmovl_s8(vget_low_s8(c.val[2]));
+ b.val[2] = vmovl_s8(vget_low_s8(c.val[3]));
+
+ a.val[1] = vmovl_high_s8(c.val[0]);
+ a.val[3] = vmovl_high_s8(c.val[1]);
+ b.val[1] = vmovl_high_s8(c.val[2]);
+ b.val[3] = vmovl_high_s8(c.val[3]);
+
+ d = vld1q_s8_x4(&in[i + 64]);
+
+ e.val[0] = vmovl_s8(vget_low_s8(d.val[0]));
+ e.val[2] = vmovl_s8(vget_low_s8(d.val[1]));
+ f.val[0] = vmovl_s8(vget_low_s8(d.val[2]));
+ f.val[2] = vmovl_s8(vget_low_s8(d.val[3]));
+
+ e.val[1] = vmovl_high_s8(d.val[0]);
+ e.val[3] = vmovl_high_s8(d.val[1]);
+ f.val[1] = vmovl_high_s8(d.val[2]);
+ f.val[3] = vmovl_high_s8(d.val[3]);
+
+ vst1q_s16_x4(&out[i], a);
+ vst1q_s16_x4(&out[i + 32], b);
+ vst1q_s16_x4(&out[i + 64], e);
+ vst1q_s16_x4(&out[i + 96], f);
+ }
+}
+
+/*
+ * Return f[] = f[]/g[] % 12289
+ * See assembly https://godbolt.org/z/od3Ex7Mbx
+ */
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
+ // Total SIMD registers: 24 = 4 + 19 + 1
+ int16x8x4_t src, dst, t, k; // 4
+ int16x8x4_t y0, y1, y2, y3, y4, y5,
+ y6, y7, y8, y9, y10, y11, y12,
+ y13, y14, y15, y16, y17, y18; // 19
+ int16x8_t neon_qmvm; // 1
+
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ // Find y0 = g^12287
+ vload_s16_x4(y0, &g[i]);
+
+ // y0 is already in Montgomery domain
+
+ montmul_x4(y1, y0, y0, neon_qmvm, t);
+ montmul_x4(y2, y1, y0, neon_qmvm, k);
+ montmul_x4(y3, y2, y1, neon_qmvm, t);
+ montmul_x4(y4, y3, y3, neon_qmvm, k);
+ montmul_x4(y5, y4, y4, neon_qmvm, t);
+ montmul_x4(y6, y5, y5, neon_qmvm, k);
+ montmul_x4(y7, y6, y6, neon_qmvm, t);
+ montmul_x4(y8, y7, y7, neon_qmvm, k);
+ montmul_x4(y9, y8, y2, neon_qmvm, t);
+ montmul_x4(y10, y9, y8, neon_qmvm, k);
+ montmul_x4(y11, y10, y10, neon_qmvm, t);
+ montmul_x4(y12, y11, y11, neon_qmvm, k);
+ montmul_x4(y13, y12, y9, neon_qmvm, t);
+ montmul_x4(y14, y13, y13, neon_qmvm, k);
+ montmul_x4(y15, y14, y14, neon_qmvm, t);
+ montmul_x4(y16, y15, y10, neon_qmvm, k);
+ montmul_x4(y17, y16, y16, neon_qmvm, t);
+ montmul_x4(y18, y17, y0, neon_qmvm, k);
+
+ vload_s16_x4(src, &f[i]);
+
+ montmul_x4(dst, y18, src, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], dst);
+ }
+}
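+
+/*
+ * Illustrative aside (not part of the upstream implementation): the montmul
+ * chain above is an addition chain for the exponent 12287 = FALCON_Q - 2, so
+ * it computes g^(q-2) = g^-1 mod q (Fermat inversion), and dst = f * g^-1.
+ * A minimal scalar sketch of the same inversion, ignoring the Montgomery
+ * domain and kept under "#if 0" so it is never compiled:
+ */
+#if 0
+static int16_t div_12289_ref(int16_t f, int16_t g) {
+ const int32_t q = 12289;
+ int32_t base = ((int32_t)g % q + q) % q;
+ int32_t inv = 1;
+ for (int32_t e = q - 2; e > 0; e >>= 1) {
+ if (e & 1) {
+ inv = (inv * base) % q; /* square-and-multiply */
+ }
+ base = (base * base) % q;
+ }
+ return (int16_t)((((int32_t)f % q + q) % q * inv) % q);
+}
+#endif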
+
+/*
+ * f = g - s
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]) {
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e, h, t; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &g[i]);
+ vload_s16_x4(b, &s[i]);
+
+ e.val[0] = vsubq_s16(a.val[0], b.val[0]);
+ e.val[1] = vsubq_s16(a.val[1], b.val[1]);
+ e.val[2] = vsubq_s16(a.val[2], b.val[2]);
+ e.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+ vload_s16_x4(c, &g[i + 32]);
+ vload_s16_x4(d, &s[i + 32]);
+
+ h.val[0] = vsubq_s16(c.val[0], d.val[0]);
+ h.val[1] = vsubq_s16(c.val[1], d.val[1]);
+ h.val[2] = vsubq_s16(c.val[2], d.val[2]);
+ h.val[3] = vsubq_s16(c.val[3], d.val[3]);
+
+ barrett_x4(e, neon_qmvm, t);
+ barrett_x4(h, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], e);
+ vstore_s16_x4(&f[i + 32], h);
+ }
+}
+
+/*
+ * Check whether f[] contains a zero coefficient.
+ * Return:
+ * 1 if 0 occurs in f[]
+ * 0 otherwise
+ */
+uint16_t PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 22 = 12 + 8 + 2
+ int16x8x4_t a, b; // 8
+ uint16x8x4_t c, d, e1; // 12
+ uint16x8x2_t e2; // 2
+
+ e2.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+
+ // Compare bitwise Equal to zero (vector)
+ // a == 0 ? 1 : 0;
+ c.val[0] = vceqzq_s16(a.val[0]);
+ c.val[1] = vceqzq_s16(a.val[1]);
+ c.val[2] = vceqzq_s16(a.val[2]);
+ c.val[3] = vceqzq_s16(a.val[3]);
+
+ vload_s16_x4(b, &f[i + 32]);
+
+ d.val[0] = vceqzq_s16(b.val[0]);
+ d.val[1] = vceqzq_s16(b.val[1]);
+ d.val[2] = vceqzq_s16(b.val[2]);
+ d.val[3] = vceqzq_s16(b.val[3]);
+
+ e1.val[0] = vorrq_u16(d.val[0], c.val[0]);
+ e1.val[1] = vorrq_u16(d.val[1], c.val[1]);
+ e1.val[2] = vorrq_u16(d.val[2], c.val[2]);
+ e1.val[3] = vorrq_u16(d.val[3], c.val[3]);
+
+ e1.val[0] = vorrq_u16(e1.val[0], e1.val[2]);
+ e1.val[1] = vorrq_u16(e1.val[1], e1.val[3]);
+
+ e2.val[0] = vorrq_u16(e1.val[0], e1.val[1]);
+
+ e2.val[1] = vorrq_u16(e2.val[1], e2.val[0]);
+ }
+
+ uint16_t ret = vmaxvq_u16(e2.val[1]);
+
+ return ret;
+}
+
+/*
+ * Branchless conditional addition of 2*FALCON_Q where a coefficient is < 0.
+ * If a coefficient is then larger than Q, Q is subtracted from it.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 26 = 8 + 16 + 1 + 1
+ uint16x8x4_t b0, b1; // 8
+ int16x8x4_t a0, a1, c0, c1; // 16
+ int16x8_t neon_q; // 1
+ uint16x8_t neon_2q; // 1
+
+ neon_q = vdupq_n_s16(FALCON_Q);
+ neon_2q = vdupq_n_u16(FALCON_Q << 1);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a0, &f[i]);
+
+ b0.val[0] = vcltzq_s16(a0.val[0]);
+ b0.val[1] = vcltzq_s16(a0.val[1]);
+ b0.val[2] = vcltzq_s16(a0.val[2]);
+ b0.val[3] = vcltzq_s16(a0.val[3]);
+
+ vload_s16_x4(a1, &f[i + 32]);
+
+ // Conditional addition with 2*FALCON_Q
+ b1.val[0] = vcltzq_s16(a1.val[0]);
+ b1.val[1] = vcltzq_s16(a1.val[1]);
+ b1.val[2] = vcltzq_s16(a1.val[2]);
+ b1.val[3] = vcltzq_s16(a1.val[3]);
+
+ c0.val[0] = vreinterpretq_s16_u16(vandq_u16(b0.val[0], neon_2q));
+ c0.val[1] = vreinterpretq_s16_u16(vandq_u16(b0.val[1], neon_2q));
+ c0.val[2] = vreinterpretq_s16_u16(vandq_u16(b0.val[2], neon_2q));
+ c0.val[3] = vreinterpretq_s16_u16(vandq_u16(b0.val[3], neon_2q));
+
+ c1.val[0] = vreinterpretq_s16_u16(vandq_u16(b1.val[0], neon_2q));
+ c1.val[1] = vreinterpretq_s16_u16(vandq_u16(b1.val[1], neon_2q));
+ c1.val[2] = vreinterpretq_s16_u16(vandq_u16(b1.val[2], neon_2q));
+ c1.val[3] = vreinterpretq_s16_u16(vandq_u16(b1.val[3], neon_2q));
+
+ vadd_x4(a0, a0, c0);
+ vadd_x4(a1, a1, c1);
+
+ // a > Q ? 1 : 0
+ b0.val[0] = vcgtq_s16(a0.val[0], neon_q);
+ b0.val[1] = vcgtq_s16(a0.val[1], neon_q);
+ b0.val[2] = vcgtq_s16(a0.val[2], neon_q);
+ b0.val[3] = vcgtq_s16(a0.val[3], neon_q);
+
+ b1.val[0] = vcgtq_s16(a1.val[0], neon_q);
+ b1.val[1] = vcgtq_s16(a1.val[1], neon_q);
+ b1.val[2] = vcgtq_s16(a1.val[2], neon_q);
+ b1.val[3] = vcgtq_s16(a1.val[3], neon_q);
+
+ // Conditional subtraction with FALCON_Q
+
+ c0.val[0] = vandq_s16(vreinterpretq_s16_u16(b0.val[0]), neon_q);
+ c0.val[1] = vandq_s16(vreinterpretq_s16_u16(b0.val[1]), neon_q);
+ c0.val[2] = vandq_s16(vreinterpretq_s16_u16(b0.val[2]), neon_q);
+ c0.val[3] = vandq_s16(vreinterpretq_s16_u16(b0.val[3]), neon_q);
+
+ c1.val[0] = vandq_s16(vreinterpretq_s16_u16(b1.val[0]), neon_q);
+ c1.val[1] = vandq_s16(vreinterpretq_s16_u16(b1.val[1]), neon_q);
+ c1.val[2] = vandq_s16(vreinterpretq_s16_u16(b1.val[2]), neon_q);
+ c1.val[3] = vandq_s16(vreinterpretq_s16_u16(b1.val[3]), neon_q);
+
+ vsub_x4(a0, a0, c0);
+ vsub_x4(a1, a1, c1);
+
+ vstore_s16_x4(&f[i], a0);
+ vstore_s16_x4(&f[i + 32], a1);
+ }
+}
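+
+/*
+ * Illustrative aside (not part of the upstream implementation): a scalar
+ * sketch of the branchless pattern used above. Each comparison yields an
+ * all-ones/all-zeros mask which is ANDed with the constant, so the addition
+ * or subtraction only takes effect where the condition holds. Kept under
+ * "#if 0" so it is never compiled:
+ */
+#if 0
+static int16_t convert_to_unsigned_ref(int16_t a) {
+ uint16_t m;
+ m = (uint16_t)(-(a < 0)); /* all ones where a < 0, else 0 */
+ a = (int16_t)(a + (m & (2 * FALCON_Q)));
+ m = (uint16_t)(-(a > FALCON_Q)); /* all ones where a > FALCON_Q, else 0 */
+ a = (int16_t)(a - (m & FALCON_Q));
+ return a;
+}
+#endif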
+
+/*
+ * Perform conditional subtraction/addition with Q to center each coefficient,
+ * then compare against the bounds min, max = -127, 127
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]) {
+ // Total SIMD registers: 32
+ int16x8x4_t a, f; // 8
+ int16x8x4_t d0, d1; // 8
+ uint16x8x4_t c0, c1, x0, x1; // 16
+ uint16x8x2_t e; // 2
+ int8x16x4_t g; // 4
+ int16x8_t neon_127, neon__127, neon_q_2, neon__q_2; // 4
+ uint16x8_t neon_q; // 1
+ neon_127 = vdupq_n_s16(127);
+ neon__127 = vdupq_n_s16(-127);
+ neon_q = vdupq_n_u16(FALCON_Q);
+ neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
+ neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
+
+ e.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &t[i]);
+ vload_s16_x4(f, &t[i + 32]);
+
+ // Conditional subtraction with FALCON_Q
+ // a >= Q/2 ? 1 : 0
+ c0.val[0] = vcgeq_s16(a.val[0], neon_q_2);
+ c0.val[1] = vcgeq_s16(a.val[1], neon_q_2);
+ c0.val[2] = vcgeq_s16(a.val[2], neon_q_2);
+ c0.val[3] = vcgeq_s16(a.val[3], neon_q_2);
+
+ c1.val[0] = vcgeq_s16(f.val[0], neon_q_2);
+ c1.val[1] = vcgeq_s16(f.val[1], neon_q_2);
+ c1.val[2] = vcgeq_s16(f.val[2], neon_q_2);
+ c1.val[3] = vcgeq_s16(f.val[3], neon_q_2);
+
+ // Perform subtraction with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vsub_x4(a, a, d0);
+ vsub_x4(f, f, d1);
+
+ // -Q/2 > a ? 1: 0
+ c0.val[0] = vcgtq_s16(neon__q_2, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__q_2, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__q_2, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__q_2, a.val[3]);
+
+ c1.val[0] = vcgtq_s16(neon__q_2, f.val[0]);
+ c1.val[1] = vcgtq_s16(neon__q_2, f.val[1]);
+ c1.val[2] = vcgtq_s16(neon__q_2, f.val[2]);
+ c1.val[3] = vcgtq_s16(neon__q_2, f.val[3]);
+
+ // Perform addition with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vadd_x4(a, a, d0);
+ vadd_x4(f, f, d1);
+
+ g.val[0] = vmovn_high_s16(vmovn_s16(a.val[0]), a.val[1]);
+ g.val[1] = vmovn_high_s16(vmovn_s16(a.val[2]), a.val[3]);
+ g.val[2] = vmovn_high_s16(vmovn_s16(f.val[0]), f.val[1]);
+ g.val[3] = vmovn_high_s16(vmovn_s16(f.val[2]), f.val[3]);
+
+ vst1q_s8_x4(&G[i], g);
+
+ // -127 > a ? 1 : 0
+ c0.val[0] = vcgtq_s16(neon__127, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__127, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__127, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__127, a.val[3]);
+ // a > 127 ? 1 : 0
+ c1.val[0] = vcgtq_s16(a.val[0], neon_127);
+ c1.val[1] = vcgtq_s16(a.val[1], neon_127);
+ c1.val[2] = vcgtq_s16(a.val[2], neon_127);
+ c1.val[3] = vcgtq_s16(a.val[3], neon_127);
+
+ // -127 > f ? 1 : 0
+ x0.val[0] = vcgtq_s16(neon__127, f.val[0]);
+ x0.val[1] = vcgtq_s16(neon__127, f.val[1]);
+ x0.val[2] = vcgtq_s16(neon__127, f.val[2]);
+ x0.val[3] = vcgtq_s16(neon__127, f.val[3]);
+ // f > 127 ? 1 : 0
+ x1.val[0] = vcgtq_s16(f.val[0], neon_127);
+ x1.val[1] = vcgtq_s16(f.val[1], neon_127);
+ x1.val[2] = vcgtq_s16(f.val[2], neon_127);
+ x1.val[3] = vcgtq_s16(f.val[3], neon_127);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c1.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], c1.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], c1.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], c1.val[3]);
+
+ x0.val[0] = vorrq_u16(x0.val[0], x1.val[0]);
+ x0.val[1] = vorrq_u16(x0.val[1], x1.val[1]);
+ x0.val[2] = vorrq_u16(x0.val[2], x1.val[2]);
+ x0.val[3] = vorrq_u16(x0.val[3], x1.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], x0.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], x0.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], x0.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], x0.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c0.val[2]);
+ c0.val[1] = vorrq_u16(c0.val[1], c0.val[3]);
+
+ e.val[0] = vorrq_u16(c0.val[0], c0.val[1]);
+
+ e.val[1] = vorrq_u16(e.val[1], e.val[0]);
+ }
+ if (vmaxvq_u16(e.val[1])) {
+ return 1;
+ }
+ return 0;
+}
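+
+/*
+ * Illustrative aside (not part of the upstream implementation): a scalar
+ * sketch of the reduction and range check above. Each coefficient is moved
+ * toward the centered range around 0 and flagged if it does not fit the
+ * [-127, 127] window of the int8 encoding; the vector code stores G[] first
+ * and aggregates the flag, but the outcome is the same. Kept under "#if 0"
+ * so it is never compiled:
+ */
+#if 0
+static int int16_to_int8_ref(int8_t *G, const int16_t *t, unsigned n) {
+ int out_of_range = 0;
+ for (unsigned i = 0; i < n; i++) {
+ int16_t a = t[i];
+ if (a >= FALCON_Q / 2) {
+ a = (int16_t)(a - FALCON_Q);
+ } else if (a < -(FALCON_Q / 2)) {
+ a = (int16_t)(a + FALCON_Q);
+ }
+ G[i] = (int8_t)a;
+ out_of_range |= (a < -127) || (a > 127);
+ }
+ return out_of_range;
+}
+#endif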
+
+/*
+ * Check whether any coefficient satisfies (t < low || t > high)
+ * Return 1 if so
+ * Otherwise 0
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high) {
+ // Total SIMD registers: 15
+ int8x16x4_t a; // 4
+ uint8x16x4_t c, d; // 8
+ uint8x16_t e; // 1
+ int8x16_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s8(high);
+ neon_low = vdupq_n_s8(low);
+ e = vdupq_n_u8(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s8(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s8(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s8(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s8(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s8(a.val[0], neon_high);
+ d.val[1] = vcgtq_s8(a.val[1], neon_high);
+ d.val[2] = vcgtq_s8(a.val[2], neon_high);
+ d.val[3] = vcgtq_s8(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u8(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u8(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u8(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u8(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u8(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[1]);
+
+ e = vorrq_u8(e, c.val[0]);
+
+ if (vmaxvq_u8(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Check whether any coefficient satisfies (t < low || t > high)
+ * Return 1 if so
+ * Otherwise 0
+ * Works for FALCON_N >= 32, i.e. FALCON_LOGN >= 5
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high) {
+ // Total SIMD registers = 15
+ int16x8x4_t a; // 4
+ uint16x8x4_t c, d; // 8
+ uint16x8_t e; // 1
+ int16x8_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s16(high);
+ neon_low = vdupq_n_s16(low);
+ e = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ a = vld1q_s16_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s16(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s16(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s16(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s16(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s16(a.val[0], neon_high);
+ d.val[1] = vcgtq_s16(a.val[1], neon_high);
+ d.val[2] = vcgtq_s16(a.val[2], neon_high);
+ d.val[3] = vcgtq_s16(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u16(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u16(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u16(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u16(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u16(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[1]);
+
+ e = vorrq_u16(e, c.val[0]);
+
+ if (vmaxvq_u16(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/pqclean.c
new file mode 100644
index 000000000..bd6f04943
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/pqclean.c
@@ -0,0 +1,377 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES bytes
+ *
+ * message + signature:
+ * signature PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES bytes
+ * message
+ */
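+
+/*
+ * For illustration (this translation unit builds with logn = 9, i.e.
+ * Falcon-padded-512): the header bytes produced and checked below are
+ * therefore 0x59 (0101 1001) for the private key, 0x09 (0000 1001) for
+ * the public key and 0x39 (0011 1001) for the signature.
+ */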
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[28 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N];
+ uint16_t h[FALCON_N];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED512_AARCH64_keygen(&rng, f, g, F, NULL, h, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + FALCON_LOGN;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ f, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ g, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ F, PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + FALCON_LOGN;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1,
+ h, FALCON_LOGN);
+ if (v != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
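+
+/*
+ * For reference, with logn = 9 the encoding above yields the usual
+ * Falcon-512 sizes: 1 + 384 + 384 + 512 = 1281 bytes for the private key
+ * (6 bits per coefficient for f and g, 8 bits for F) and 1 + 896 = 897
+ * bytes for the public key (14 bits per coefficient for h).
+ */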
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N], G[FALCON_N];
+ struct {
+ int16_t sig[FALCON_N];
+ uint16_t hm[FALCON_N];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + FALCON_LOGN) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(
+ f, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(
+ g, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(
+ F, PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED512_AARCH64_complete_private(G, f, g, F, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+     * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(&sc, r.hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCONPADDED512_AARCH64_comp_encode(sigbuf, sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
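+
+/*
+ * Note on the padded format: the memset() above fills the unused tail of
+ * sigbuf[] with zeros, so the encoded signature value always occupies
+ * exactly sigbuflen bytes and the complete signature has the fixed length
+ * PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES; do_verify() below accepts
+ * such trailing bytes only when they are all zero.
+ */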
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int16_t h[FALCON_N];
+ int16_t hm[FALCON_N];
+ int16_t sig[FALCON_N];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + FALCON_LOGN) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED512_AARCH64_modq_decode( (uint16_t *) h,
+ pk + 1, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1, FALCON_LOGN)
+ != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ // We move the conversion to NTT domain of `h` inside verify_raw()
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED512_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(&sc, (uint16_t *) hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED512_AARCH64_verify_raw(hm, sig, h, (int16_t *) tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + FALCON_LOGN;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
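+
+/*
+ * Since vlen is fixed to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES -
+ * NONCELEN - 1, *siglen is always exactly CRYPTO_BYTES: the padded
+ * variant never produces variable-length signatures.
+ */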
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + FALCON_LOGN;
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/rng.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/rng.c
new file mode 100644
index 000000000..cd5bd7703
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/rng.c
@@ -0,0 +1,194 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "inner.h"
+
+int PQCLEAN_FALCONPADDED512_AARCH64_get_seed(void *seed, size_t len) {
+ unsigned char tmp[48];
+ for (size_t i = 0; i < len; i++) {
+ tmp[i] = (unsigned char) i;
+ }
+ memcpy(seed, tmp, len);
+ return 1;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ *(uint32_t *)(p->state.d + (i << 2)) = w;
+ }
+ tl = *(uint32_t *)(p->state.d + 48);
+ th = *(uint32_t *)(p->state.d + 52);
+ *(uint64_t *)(p->state.d + 48) = tl + (th << 32);
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * The state consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
+ */
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
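+
+/*
+ * Layout note: with the indexing above, byte j (0..3) of word v of
+ * parallel instance u is stored at buf.d[(u << 2) + (v << 5) + j], so the
+ * 8 * 64 = 512-byte buffer holds eight interleaved ChaCha20 blocks rather
+ * than eight consecutive ones.
+ */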
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c
new file mode 100644
index 000000000..e77dc4b52
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c
@@ -0,0 +1,292 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include <arm_neon.h>
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+
+ uint32x4x3_t w;
+ uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
+ uint32x2x3_t wh;
+ uint32x2_t cc0h, cc1h, cc2h, zzh;
+ x0 = vdupq_n_u32(v0);
+ x1 = vdupq_n_u32(v1);
+ x2 = vdupq_n_u32(v2);
+
+ // 0: 0, 3, 6, 9
+ // 1: 1, 4, 7, 10
+ // 2: 2, 5, 8, 11
+ // v0 - w0
+ // v1 - w1
+ // v2 - w2
+ // cc1 - cc0 >> 31
+ // cc2 - cc1 >> 31
+ // z + cc2 >> 31
+ w = vld3q_u32(&dist[0]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vshrq_n_u32(cc2, 31);
+
+ w = vld3q_u32(&dist[12]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[24]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[36]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ // 0: 48, 51
+ // 1: 49, 52
+ // 2: 50, 53
+ wh = vld3_u32(&dist[48]);
+ cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
+ cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
+ cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
+ cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
+ cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
+ zzh = vshr_n_u32(cc2h, 31);
+
+ z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
+ return z;
+}
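+
+/*
+ * For reference: the NEON code above counts, in constant time, how many
+ * entries of dist[] (each stored as three 24-bit limbs, high limb first)
+ * are strictly greater than the random value v2:v1:v0. A scalar sketch of
+ * the same counting logic:
+ *
+ *     z = 0;
+ *     for (size_t u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) {
+ *         uint32_t cc;
+ *         cc = (v0 - dist[u + 2]) >> 31;
+ *         cc = (v1 - dist[u + 1] - cc) >> 31;
+ *         cc = (v2 - dist[u + 0] - cc) >> 31;
+ *         z += (int)cc;
+ *     }
+ */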
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+     * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+     * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+     * exp(-x) = 2^(-s) * exp(-r); we lazily compare exp(-x) with the
+     * PRNG output to limit its consumption, and the sign of the
+     * difference yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
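+
+/*
+ * In the loop above, the PRNG output is compared against z one byte at a
+ * time, most significant byte first, stopping at the first difference;
+ * the returned bit (w >> 31) is 1 exactly when the random 64-bit value is
+ * strictly below z, i.e. with probability z/2^64, which is approximately
+ * ccs*exp(-x).
+ */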
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+         * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sign.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sign.c
new file mode 100644
index 000000000..550a6e434
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sign.c
@@ -0,0 +1,953 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+#include "util.h"
+#include <stddef.h>
+#include <string.h>
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+     * has two child trees of size logn-1 each. Thus, the tree size s()
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
+ return (logn + 1) << logn;
+}
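+
+/*
+ * Sanity check of the closed form: s(logn) = (logn + 1) * 2^logn indeed
+ * satisfies s(0) = 1 and s(logn) = 2^logn + 2*s(logn - 1). For logn = 9
+ * (Falcon-padded-512) the full tree therefore uses 10 * 512 = 5120 fpr
+ * elements.
+ */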
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *restrict tree,
+ fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *restrict tree, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11,
+ unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+
+ ffLDL_fft_inner(tree + n, d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), d00, d00 + hn, logn - 1, tmp);
+
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma_9);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
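+
+/*
+ * With logn = 9, the expanded key therefore lays out b00, b01, b10 and
+ * b11 at fpr offsets 0, 512, 1024 and 1536, followed by the ffLDL tree at
+ * offset 2048, for a total of 4*512 + 5120 = 7168 fpr values.
+ */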
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp) {
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rg = b00;
+ rf = b01;
+ rG = b10;
+ rF = b11;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rg, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rg, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rf, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rf, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(rf, rf, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rG, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rG, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rF, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rF, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(rF, rF, FALCON_LOGN);
+
+    /*
+     * The FFT conversions and negations above have put the basis
+     * B0 = [[g, -f], [G, -F]] in FFT representation.
+     */
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + FALCON_N;
+ g11 = g01 + FALCON_N;
+ gxx = g11 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(g00, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(g00, g00, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(g01, b00, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(g01, g01, b01, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(g11, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(g11, g11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, FALCON_LOGN, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, FALCON_LOGN, FALCON_LOGN);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *restrict t0, fpr *restrict t1,
+ fpr *restrict g00, fpr *restrict g01, fpr *restrict g11,
+ unsigned orig_logn, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma_9);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(z1, t1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, t0, tmp, z1, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have room for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *restrict z0, fpr *restrict z1,
+ const fpr *restrict tree,
+ const fpr *restrict t0, const fpr *restrict t1, unsigned logn,
+ fpr *restrict tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ // Split
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ // Split
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * General recursive case (logn >= 2).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(tmp, t1, z1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(tmp, t0, tmp, tree, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ t0 = tmp;
+ t1 = t0 + FALCON_N;
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, FALCON_LOGN, ty + FALCON_N);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, tx, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, t0, ty, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t0, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, tx, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t1, t1, ty, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * Compute the signature.
+ */
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ *
+ * Memory layout:
+ * b00 | b01 | b10 | b11 | t0 | t1
+ * g00 | g01 | g11 | b01 | t0 | t1
+ */
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(t1, b00, b10, FALCON_LOGN); // t1 <- b00*adj(b10)
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(t0, b01, FALCON_LOGN); // t0 <- b01*adj(b01)
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(b00, b00, FALCON_LOGN); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_add(b00, b00, t0, FALCON_LOGN); // b00 <- g00
+
+ memcpy(t0, b01, FALCON_N * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(b01, t1, b01, b11, FALCON_LOGN); // b01 <- b01*adj(b11)
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(b10, b10, FALCON_LOGN); // b10 <- b10*adj(b10)
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(b10, b10, b11, FALCON_LOGN); // t1 = g11 <- b11*adj(b11)
+
+ /*
+ * We rename variables to make things clearer. The three elements
+     * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = g11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ * t1, g00
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, FALCON_LOGN, FALCON_LOGN, t1 + FALCON_N);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ memmove(b11 + FALCON_N, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(tx, t0, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(ty, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, tx, t1, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t1, ty, t1, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t0, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min_9;
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig, expanded_key, hm, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+
+ spc.sigma_min = fpr_sigma_min_9;
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig, f, g, F, G, hm, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.c
new file mode 100644
index 000000000..5f63c48fc
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.c
@@ -0,0 +1,71 @@
+/*
+ * Utility functions
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "util.h"
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, const unsigned logn) {
+ float64x2x4_t neon_flo64, neon_fhi64;
+ int64x2x4_t neon_lo64, neon_hi64;
+ int32x4_t neon_lo32[2], neon_hi32[2];
+ int16x8_t neon_lo16, neon_hi16;
+ int8x16_t neon_8;
+
+ const unsigned falcon_n = 1 << logn;
+
+ for (unsigned i = 0; i < falcon_n; i += 16) {
+ neon_8 = vld1q_s8(&t[i]);
+
+ // Extend from 8 to 16 bit
+        // x7 | x6 | x5 | x4 - x3 | x2 | x1 | x0
+ neon_lo16 = vmovl_s8(vget_low_s8(neon_8));
+ neon_hi16 = vmovl_high_s8(neon_8);
+
+ // Extend from 16 to 32 bit
+ // xxx3 | xxx2 | xxx1 | xxx0
+ neon_lo32[0] = vmovl_s16(vget_low_s16(neon_lo16));
+ neon_lo32[1] = vmovl_high_s16(neon_lo16);
+ neon_hi32[0] = vmovl_s16(vget_low_s16(neon_hi16));
+ neon_hi32[1] = vmovl_high_s16(neon_hi16);
+
+ // Extend from 32 to 64 bit
+ neon_lo64.val[0] = vmovl_s32(vget_low_s32(neon_lo32[0]));
+ neon_lo64.val[1] = vmovl_high_s32(neon_lo32[0]);
+ neon_lo64.val[2] = vmovl_s32(vget_low_s32(neon_lo32[1]));
+ neon_lo64.val[3] = vmovl_high_s32(neon_lo32[1]);
+
+ neon_hi64.val[0] = vmovl_s32(vget_low_s32(neon_hi32[0]));
+ neon_hi64.val[1] = vmovl_high_s32(neon_hi32[0]);
+ neon_hi64.val[2] = vmovl_s32(vget_low_s32(neon_hi32[1]));
+ neon_hi64.val[3] = vmovl_high_s32(neon_hi32[1]);
+
+ vfcvtx4(neon_flo64, neon_lo64);
+ vfcvtx4(neon_fhi64, neon_hi64);
+
+ vstorex4(&r[i], neon_flo64);
+ vstorex4(&r[i + 8], neon_fhi64);
+ }
+}
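+
+/*
+ * Functionally this is equivalent to the scalar loop
+ *     for (unsigned u = 0; u < falcon_n; u++) r[u] = fpr_of(t[u]);
+ * the NEON version simply widens 16 coefficients at a time from 8-bit to
+ * 64-bit integers before converting them to double precision.
+ */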
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.h
new file mode 100644
index 000000000..e3576bc5c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.h
@@ -0,0 +1,8 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#define poly_small_to_fp PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr
+
+void PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/vrfy.c
new file mode 100644
index 000000000..c1345d95a
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/vrfy.c
@@ -0,0 +1,174 @@
+/*
+ * Falcon signature verification.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt(int16_t *h) {
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt_monty(int16_t *h) {
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_MONT);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ /*
+ * Compute s1 = c0 - s2*h mod phi mod q (in tt[]).
+ */
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT_INV);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(tt, h);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(tt, INVNTT_NONE);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(tt, c0, tt);
+
+ /*
+ * Signature is valid if and only if the aggregate (s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED512_AARCH64_is_short(tt, s2);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_compute_public(int16_t *h, const int8_t *f, const int8_t *g, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(h, g);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(tt, f);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(tt)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_convert_to_unsigned(h);
+
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp) {
+ int16_t *t1, *t2;
+
+ t1 = (int16_t *)tmp;
+ t2 = t1 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(t1, g);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(t1, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(t2, F);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(t1, t2);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(t2, f);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(t2)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(t1, t2);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(t1, INVNTT_NINV);
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_int16_to_int8(G, t1)) {
+ return 0;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ r = PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(tt);
+
+ return (int)(1u - (r >> 15));
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(h, c0, s1);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT);
+ r = PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(tt);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = (uint16_t) (~r & (uint16_t) - PQCLEAN_FALCONPADDED512_AARCH64_is_short(s1, s2));
+ return (int)(r >> 15);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp) {
+ int16_t *s2 = (int16_t *)tmp;
+
+ memcpy(s2, sig, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(s2, NTT_MONT);
+
+ int r = PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(s2);
+
+ return r;
+}
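
verify_raw above recomputes s1 = c0 - s2*h mod (X^N + 1) mod q in the NTT domain and then checks that the pair (s1, s2) is short. A schoolbook sketch of the same check, without the NTT and with a simple 64-bit norm accumulator; the bound parameter here is a stand-in for floor(beta^2) at the chosen degree:

    #include <stdint.h>
    #include <stddef.h>

    #define Q 12289

    /* Schoolbook sketch of the verify_raw check: recompute
     * s1 = c0 - s2*h mod (X^n + 1) mod q, centered around 0, and
     * compare the squared l2-norm of (s1, s2) to the acceptance
     * bound. Illustrative only; the real code uses the NTT and a
     * saturating 32-bit norm computation. */
    static int verify_raw_ref(const int16_t *c0, const int16_t *s2,
                              const int16_t *h, size_t n, uint64_t bound) {
        uint64_t norm = 0;
        for (size_t i = 0; i < n; i++) {
            int64_t acc = 0;
            for (size_t j = 0; j <= i; j++) {
                acc += (int64_t)s2[j] * h[i - j];        /* no wrap-around */
            }
            for (size_t j = i + 1; j < n; j++) {
                acc -= (int64_t)s2[j] * h[n + i - j];    /* X^n = -1 */
            }
            int64_t s1 = ((int64_t)c0[i] - acc) % Q;
            if (s1 > Q / 2) {
                s1 -= Q;
            } else if (s1 < -(Q / 2)) {
                s1 += Q;
            }
            norm += (uint64_t)(s1 * s1)
                  + (uint64_t)((int64_t)s2[i] * s2[i]);
        }
        return norm <= bound;
    }
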
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/LICENSE b/src/sig/falcon/pqclean_falcon-padded-512_avx2/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/api.h b/src/sig/falcon/pqclean_falcon-padded-512_avx2/api.h
new file mode 100644
index 000000000..c039206c7
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED512_AVX2_API_H
+#define PQCLEAN_FALCONPADDED512_AVX2_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES 1281
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES 666
+
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_ALGNAME "Falcon-padded-512"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
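
A minimal sign/verify round trip against the API declared above; this is a sketch that assumes the library (and the randombytes provider PQClean expects) is linked in:

    #include <stdio.h>
    #include "api.h"

    int main(void) {
        uint8_t pk[PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES];
        uint8_t sk[PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES];
        uint8_t sig[PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES];
        size_t siglen;
        const uint8_t msg[] = "test message";

        if (PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(pk, sk) != 0) {
            return 1;
        }
        if (PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(sig, &siglen,
                msg, sizeof msg - 1, sk) != 0) {
            return 1;
        }
        /* siglen is at most CRYPTO_BYTES, per the contract above. */
        if (PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(sig, siglen,
                msg, sizeof msg - 1, pk) != 0) {
            return 1;
        }
        printf("verified a %zu-byte signature\n", siglen);
        return 0;
    }
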
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/codec.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/codec.c
new file mode 100644
index 000000000..64f07533a
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * fit in the accumulator (a uint32_t).
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest signature coefficient observed during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
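
comp_encode above spends, for each signature coefficient, one sign bit, the seven low bits of the absolute value, and a unary run of (|x| >> 7) zeros closed by a 1 bit. A small helper that counts those bits per coefficient makes the format easy to check (sketch only; the helper name is made up):

    #include <stdint.h>

    /* Number of bits comp_encode spends on one coefficient x with
     * |x| <= 2047: 1 sign bit, 7 low bits of |x|, then |x| >> 7
     * zeros and a terminating 1 bit. Illustrative helper only. */
    static unsigned comp_bits_for(int16_t x) {
        unsigned w = (unsigned)(x < 0 ? -x : x);
        return 1 + 7 + (w >> 7) + 1;
    }

For example, x = 300 gives |x| = 0b100101100: the low seven bits are 0101100, the high part is 2, so the coefficient costs 1 + 7 + 2 + 1 = 11 bits.
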
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/common.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/common.c
new file mode 100644
index 000000000..70ef4d04d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce modulo q the values; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
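
hash_to_point_ct above reduces each 16-bit SHAKE sample modulo q = 12289 without branches (two conditional subtractions of 2q, then one of q) and marks rejected samples (w >= 61445 = 5q) as 0xFFFF before squeezing them out. The same per-sample mapping written with ordinary branches, for readability (a sketch, not the constant-time code):

    #include <stdint.h>

    #define Q 12289

    /* Branching equivalent of the branchless sample mapping used in
     * hash_to_point_ct(): keep w mod q when w < 5*q, otherwise
     * return 0xFFFF to mark the sample as rejected. */
    static uint16_t reduce_or_reject(uint32_t w) {
        if (w >= 5 * (uint32_t)Q) {    /* 61445..65535: rejected */
            return 0xFFFF;
        }
        return (uint16_t)(w % Q);
    }
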
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/fft.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fft.c
new file mode 100644
index 000000000..8ba5b435d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fft.c
@@ -0,0 +1,1108 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ if (ht >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ z_re = _mm256_loadu_pd(&f[j + ht].v);
+ z_im = _mm256_loadu_pd(&f[j + ht + hn].v);
+ y_re = FMSUB(z_re, s_re,
+ _mm256_mul_pd(z_im, s_im));
+ y_im = FMADD(z_re, s_im,
+ _mm256_mul_pd(z_im, s_re));
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ _mm256_storeu_pd(&f[j + ht].v,
+ _mm256_sub_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + ht + hn].v,
+ _mm256_sub_pd(x_im, y_im));
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im,
+ y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+ * simple doubling of the values in all the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ if (t >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ y_re = _mm256_loadu_pd(&f[j + t].v);
+ y_im = _mm256_loadu_pd(&f[j + t + hn].v);
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ x_re = _mm256_sub_pd(y_re, x_re);
+ x_im = _mm256_sub_pd(x_im, y_im);
+ z_re = FMSUB(x_im, s_im,
+ _mm256_mul_pd(x_re, s_re));
+ z_im = FMADD(x_re, s_im,
+ _mm256_mul_pd(x_im, s_re));
+ _mm256_storeu_pd(&f[j + t].v, z_re);
+ _mm256_storeu_pd(&f[j + t + hn].v, z_im);
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im,
+ x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_add_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_sub_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 8) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = (n >> 1); u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMSUB(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMADD(
+ a_re, b_im, _mm256_mul_pd(a_im, b_re));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d zero;
+
+ zero = _mm256_setzero_pd();
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v,
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)));
+ _mm256_storeu_pd(&a[u + hn].v, zero);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d x4;
+
+ x4 = _mm256_set1_pd(x.v);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im, t;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ t = _mm256_div_pd(one,
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im)));
+ b_re = _mm256_mul_pd(b_re, t);
+ b_im = _mm256_mul_pd(b_im, t);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, dv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ dv = _mm256_div_pd(one,
+ _mm256_add_pd(
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)),
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im))));
+ _mm256_storeu_pd(&d[u].v, dv);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d F_re, F_im, G_re, G_im;
+ __m256d f_re, f_im, g_re, g_im;
+ __m256d a_re, a_im, b_re, b_im;
+
+ F_re = _mm256_loadu_pd(&F[u].v);
+ F_im = _mm256_loadu_pd(&F[u + hn].v);
+ G_re = _mm256_loadu_pd(&G[u].v);
+ G_im = _mm256_loadu_pd(&G[u + hn].v);
+ f_re = _mm256_loadu_pd(&f[u].v);
+ f_im = _mm256_loadu_pd(&f[u + hn].v);
+ g_re = _mm256_loadu_pd(&g[u].v);
+ g_im = _mm256_loadu_pd(&g[u + hn].v);
+
+ a_re = FMADD(F_re, f_re,
+ _mm256_mul_pd(F_im, f_im));
+ a_im = FMSUB(F_im, f_re,
+ _mm256_mul_pd(F_re, f_im));
+ b_re = FMADD(G_re, g_re,
+ _mm256_mul_pd(G_im, g_im));
+ b_im = FMSUB(G_im, g_re,
+ _mm256_mul_pd(G_re, g_im));
+ _mm256_storeu_pd(&d[u].v,
+ _mm256_add_pd(a_re, b_re));
+ _mm256_storeu_pd(&d[u + hn].v,
+ _mm256_add_pd(a_im, b_im));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, bv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ bv = _mm256_loadu_pd(&b[u].v);
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(a_re, bv));
+ _mm256_storeu_pd(&a[u + hn].v,
+ _mm256_mul_pd(a_im, bv));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d ib, a_re, a_im;
+
+ ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
+ _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&g11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&g11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&g01[u].v, mu_re);
+ _mm256_storeu_pd(&g01[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&d11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&d11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&l10[u].v, mu_re);
+ _mm256_storeu_pd(&l10[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * bit-reversal function over the ring degree. This changes
+ * indexes with regards to the Falcon specification.
+ */
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 8) {
+ __m256d half, sv;
+
+ half = _mm256_set1_pd(0.5);
+ sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
+ for (u = 0; u < qn; u += 2) {
+ __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;
+
+ ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
+ ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
+ ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
+ ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
+ _mm_storeu_pd(&f0[u].v,
+ _mm256_extractf128_pd(ff0, 0));
+ _mm_storeu_pd(&f0[u + qn].v,
+ _mm256_extractf128_pd(ff0, 1));
+
+ ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
+ gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
+ ff3 = _mm256_hadd_pd(
+ _mm256_mul_pd(ff1, gmt),
+ _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
+ ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
+ _mm_storeu_pd(&f1[u].v,
+ _mm256_extractf128_pd(ff3, 0));
+ _mm_storeu_pd(&f1[u + qn].v,
+ _mm256_extractf128_pd(ff3, 1));
+ }
+ } else {
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+ }
+}
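+
+/*
+ * A minimal sketch (hypothetical helper, not used by the code above) of
+ * the rev() bit-reversal permutation mentioned in the comment at the top
+ * of poly_split_fft: element i of the FFT representation corresponds to
+ * the natural-order index obtained by reversing the logn bits of i.
+ */
+static inline unsigned
+rev_index_sketch(unsigned i, unsigned logn) {
+ unsigned r, k;
+
+ r = 0;
+ for (k = 0; k < logn; k ++) {
+ r = (r << 1) | (i & 1u);
+ i >>= 1;
+ }
+ return r;
+}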
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 16) {
+ for (u = 0; u < qn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+ __m256d gm1, gm2, g_re, g_im;
+ __m256d t_re, t_im, u_re, u_im;
+ __m256d tu1_re, tu2_re, tu1_im, tu2_im;
+
+ a_re = _mm256_loadu_pd(&f0[u].v);
+ a_im = _mm256_loadu_pd(&f0[u + qn].v);
+ c_re = _mm256_loadu_pd(&f1[u].v);
+ c_im = _mm256_loadu_pd(&f1[u + qn].v);
+
+ gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
+ g_re = _mm256_unpacklo_pd(gm1, gm2);
+ g_im = _mm256_unpackhi_pd(gm1, gm2);
+ g_re = _mm256_permute4x64_pd(g_re, 0xD8);
+ g_im = _mm256_permute4x64_pd(g_im, 0xD8);
+
+ b_re = FMSUB(
+ c_re, g_re, _mm256_mul_pd(c_im, g_im));
+ b_im = FMADD(
+ c_re, g_im, _mm256_mul_pd(c_im, g_re));
+
+ t_re = _mm256_add_pd(a_re, b_re);
+ t_im = _mm256_add_pd(a_im, b_im);
+ u_re = _mm256_sub_pd(a_re, b_re);
+ u_im = _mm256_sub_pd(a_im, b_im);
+
+ tu1_re = _mm256_unpacklo_pd(t_re, u_re);
+ tu2_re = _mm256_unpackhi_pd(t_re, u_re);
+ tu1_im = _mm256_unpacklo_pd(t_im, u_im);
+ tu2_im = _mm256_unpackhi_pd(t_im, u_im);
+ _mm256_storeu_pd(&f[(u << 1)].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
+ _mm256_storeu_pd(&f[(u << 1) + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
+ }
+ } else {
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+ }
+}
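+
+/*
+ * A brief note on how the two routines above relate (as suggested by
+ * their scalar branches): poly_merge_fft reverses poly_split_fft. The
+ * twiddle factors in fpr_gm_tab lie on the unit circle, so splitting f
+ * into (f0, f1) and merging them back reconstructs f, up to
+ * floating-point rounding.
+ */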
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.c
new file mode 100644
index 000000000..8940f3400
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.c
@@ -0,0 +1,1076 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+const fpr fpr_gm_tab[] = {
+ {0}, {0}, /* unused */
+ {-0.000000000000000000000000000}, { 1.000000000000000000000000000},
+ { 0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ {-0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ { 0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ {-0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ { 0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ {-0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ { 0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ {-0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ { 0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ {-0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ { 0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ {-0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ { 0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ {-0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ { 0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ {-0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ { 0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ {-0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ { 0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ {-0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ { 0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ {-0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ { 0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ {-0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ { 0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ {-0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ { 0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ {-0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ { 0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ {-0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ { 0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ {-0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ { 0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ {-0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ { 0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ {-0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ { 0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ {-0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ { 0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ {-0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ { 0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ {-0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ { 0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ {-0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ { 0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ {-0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ { 0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ {-0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ { 0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ {-0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ { 0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ {-0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ { 0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ {-0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ { 0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ {-0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ { 0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ {-0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ { 0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ {-0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ { 0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ {-0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ { 0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ {-0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ { 0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ {-0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ { 0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ {-0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ { 0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ {-0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ { 0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ {-0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ { 0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ {-0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ { 0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ {-0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ { 0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ {-0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ { 0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ {-0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ { 0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ {-0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ { 0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ {-0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ { 0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ {-0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ { 0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ {-0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ { 0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ {-0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ { 0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ {-0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ { 0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ {-0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ { 0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ {-0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ { 0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ {-0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ { 0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ {-0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ { 0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ {-0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ { 0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ {-0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ { 0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ {-0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ { 0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ {-0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ { 0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ {-0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ { 0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ {-0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ { 0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ {-0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ { 0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ {-0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ { 0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ {-0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ { 0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ {-0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ { 0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ {-0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ { 0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ {-0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ { 0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ {-0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ { 0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ {-0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ { 0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ {-0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ { 0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ {-0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ { 0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ {-0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ { 0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ {-0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ { 0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ {-0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ { 0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ {-0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ { 0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ {-0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ { 0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ {-0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ { 0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ {-0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ { 0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ {-0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ { 0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ {-0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ { 0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ {-0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ { 0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ {-0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ { 0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ {-0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ { 0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ {-0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ { 0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ {-0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ { 0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ {-0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ { 0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ {-0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ { 0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ {-0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ { 0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ {-0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ { 0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ {-0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ { 0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ {-0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ { 0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ {-0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ { 0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ {-0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ { 0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ {-0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ { 0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ {-0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ { 0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ {-0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ { 0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ {-0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ { 0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ {-0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ { 0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ {-0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ { 0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ {-0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ { 0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ {-0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ { 0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ {-0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ { 0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ {-0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ { 0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ {-0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ { 0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ {-0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ { 0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ {-0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ { 0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ {-0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ { 0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ {-0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ { 0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ {-0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ { 0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ {-0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ { 0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ {-0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ { 0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ {-0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ { 0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ {-0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ { 0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ {-0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ { 0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ {-0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ { 0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ {-0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ { 0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ {-0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ { 0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ {-0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ { 0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ {-0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ { 0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ {-0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ { 0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ {-0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ { 0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ {-0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ { 0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ {-0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ { 0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ {-0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ { 0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ {-0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ { 0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ {-0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ { 0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ {-0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ { 0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ {-0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ { 0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ {-0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ { 0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ {-0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ { 0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ {-0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ { 0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ {-0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ { 0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ {-0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ { 0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ {-0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ { 0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ {-0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ { 0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ {-0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ { 0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ {-0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ { 0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ {-0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ { 0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ {-0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ { 0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ {-0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ { 0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ {-0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ { 0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ {-0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ { 0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ {-0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ { 0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ {-0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ { 0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ {-0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ { 0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ {-0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ { 0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ {-0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ { 0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ {-0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ { 0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ {-0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ { 0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ {-0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ { 0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ {-0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ { 0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ {-0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ { 0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ {-0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ { 0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ {-0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ { 0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ {-0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ { 0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ {-0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ { 0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ {-0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ { 0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ {-0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ { 0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ {-0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ { 0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ {-0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ { 0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ {-0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ { 0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ {-0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ { 0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ {-0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ { 0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ {-0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ { 0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ {-0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ { 0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ {-0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ { 0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ {-0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ { 0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ {-0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ { 0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ {-0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ { 0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ {-0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ { 0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ {-0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ { 0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ {-0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ { 0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ {-0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ { 0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ {-0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ { 0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ {-0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ { 0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ {-0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ { 0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ {-0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ { 0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ {-0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ { 0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ {-0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ { 0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ {-0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ { 0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ {-0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ { 0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ {-0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ { 0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ {-0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ { 0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ {-0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ { 0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ {-0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ { 0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ {-0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ { 0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ {-0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ { 0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ {-0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ { 0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ {-0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ { 0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ {-0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ { 0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ {-0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ { 0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ {-0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ { 0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ {-0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ { 0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ {-0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ { 0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ {-0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ { 0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ {-0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ { 0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ {-0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ { 0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ {-0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ { 0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ {-0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ { 0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ {-0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ { 0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ {-0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ { 0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ {-0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ { 0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ {-0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ { 0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ {-0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ { 0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ {-0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ { 0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ {-0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ { 0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ {-0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ { 0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ {-0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ { 0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ {-0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ { 0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ {-0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ { 0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ {-0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ { 0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ {-0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ { 0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ {-0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ { 0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ {-0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ { 0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ {-0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ { 0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ {-0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ { 0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ {-0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ { 0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ {-0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ { 0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ {-0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ { 0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ {-0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ { 0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ {-0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ { 0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ {-0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ { 0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ {-0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ { 0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ {-0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ { 0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ {-0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ { 0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ {-0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ { 0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ {-0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ { 0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ {-0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ { 0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ {-0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ { 0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ {-0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ { 0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ {-0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ { 0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ {-0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ { 0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ {-0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ { 0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ {-0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ { 0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ {-0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ { 0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ {-0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ { 0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ {-0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ { 0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ {-0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ { 0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ {-0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ { 0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ {-0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ { 0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ {-0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ { 0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ {-0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ { 0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ {-0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ { 0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ {-0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ { 0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ {-0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ { 0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ {-0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ { 0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ {-0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ { 0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ {-0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ { 0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ {-0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ { 0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ {-0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ { 0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ {-0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ { 0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ {-0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ { 0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ {-0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ { 0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ {-0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ { 0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ {-0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ { 0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ {-0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ { 0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ {-0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ { 0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ {-0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ { 0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ {-0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ { 0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ {-0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ { 0.999995293809576171511580126}, { 0.003067956762965976270145365},
+ {-0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ { 0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ {-0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ { 0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ {-0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ { 0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ {-0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ { 0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ {-0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ { 0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ {-0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ { 0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ {-0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ { 0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ {-0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ { 0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ {-0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ { 0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ {-0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ { 0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ {-0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ { 0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ {-0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ { 0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ {-0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ { 0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ {-0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ { 0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ {-0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ { 0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ {-0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ { 0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ {-0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ { 0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ {-0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ { 0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ {-0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ { 0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ {-0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ { 0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ {-0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ { 0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ {-0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ { 0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ {-0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ { 0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ {-0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ { 0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ {-0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ { 0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ {-0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ { 0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ {-0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ { 0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ {-0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ { 0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ {-0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ { 0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ {-0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ { 0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ {-0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ { 0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ {-0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ { 0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ {-0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ { 0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ {-0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ { 0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ {-0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ { 0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ {-0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ { 0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ {-0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ { 0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ {-0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ { 0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ {-0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ { 0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ {-0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ { 0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ {-0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ { 0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ {-0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ { 0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ {-0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ { 0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ {-0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ { 0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ {-0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ { 0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ {-0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ { 0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ {-0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ { 0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ {-0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ { 0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ {-0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ { 0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ {-0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ { 0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ {-0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ { 0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ {-0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ { 0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ {-0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ { 0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ {-0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ { 0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ {-0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ { 0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ {-0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ { 0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ {-0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ { 0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ {-0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ { 0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ {-0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ { 0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ {-0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ { 0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ {-0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ { 0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ {-0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ { 0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ {-0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ { 0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ {-0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ { 0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ {-0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ { 0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ {-0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ { 0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ {-0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ { 0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ {-0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ { 0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ {-0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ { 0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ {-0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ { 0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ {-0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ { 0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ {-0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ { 0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ {-0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ { 0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ {-0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ { 0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ {-0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ { 0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ {-0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ { 0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ {-0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ { 0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ {-0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ { 0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ {-0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ { 0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ {-0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ { 0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ {-0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ { 0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ {-0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ { 0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ {-0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ { 0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ {-0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ { 0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ {-0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ { 0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ {-0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ { 0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ {-0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ { 0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ {-0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ { 0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ {-0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ { 0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ {-0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ { 0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ {-0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ { 0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ {-0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ { 0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ {-0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ { 0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ {-0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ { 0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ {-0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ { 0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ {-0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ { 0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ {-0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ { 0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ {-0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ { 0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ {-0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ { 0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ {-0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ { 0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ {-0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ { 0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ {-0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ { 0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ {-0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ { 0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ {-0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ { 0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ {-0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ { 0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ {-0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ { 0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ {-0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ { 0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ {-0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ { 0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ {-0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ { 0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ {-0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ { 0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ {-0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ { 0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ {-0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ { 0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ {-0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ { 0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ {-0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ { 0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ {-0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ { 0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ {-0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ { 0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ {-0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ { 0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ {-0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ { 0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ {-0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ { 0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ {-0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ { 0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ {-0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ { 0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ {-0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ { 0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ {-0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ { 0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ {-0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ { 0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ {-0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ { 0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ {-0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ { 0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ {-0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ { 0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ {-0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ { 0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ {-0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ { 0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ {-0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ { 0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ {-0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ { 0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ {-0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ { 0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ {-0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ { 0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ {-0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ { 0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ {-0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ { 0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ {-0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ { 0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ {-0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ { 0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ {-0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ { 0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ {-0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ { 0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ {-0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ { 0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ {-0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ { 0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ {-0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ { 0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ {-0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ { 0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ {-0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ { 0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ {-0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ { 0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ {-0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ { 0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ {-0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ { 0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ {-0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ { 0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ {-0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ { 0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ {-0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ { 0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ {-0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ { 0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ {-0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ { 0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ {-0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ { 0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ {-0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ { 0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ {-0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ { 0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ {-0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ { 0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ {-0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ { 0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ {-0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ { 0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ {-0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ { 0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ {-0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ { 0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ {-0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ { 0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ {-0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ { 0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ {-0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ { 0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ {-0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ { 0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ {-0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ { 0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ {-0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ { 0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ {-0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ { 0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ {-0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ { 0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ {-0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ { 0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ {-0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ { 0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ {-0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ { 0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ {-0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ { 0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ {-0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ { 0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ {-0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ { 0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ {-0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ { 0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ {-0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ { 0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ {-0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ { 0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ {-0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ { 0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ {-0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ { 0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ {-0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ { 0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ {-0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ { 0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ {-0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ { 0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ {-0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ { 0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ {-0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ { 0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ {-0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ { 0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ {-0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ { 0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ {-0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ { 0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ {-0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ { 0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ {-0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ { 0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ {-0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ { 0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ {-0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ { 0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ {-0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ { 0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ {-0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ { 0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ {-0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ { 0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ {-0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ { 0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ {-0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ { 0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ {-0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ { 0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ {-0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ { 0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ {-0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ { 0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ {-0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ { 0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ {-0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ { 0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ {-0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ { 0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ {-0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ { 0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ {-0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ { 0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ {-0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ { 0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ {-0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ { 0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ {-0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ { 0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ {-0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ { 0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ {-0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ { 0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ {-0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ { 0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ {-0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ { 0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ {-0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ { 0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ {-0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ { 0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ {-0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ { 0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ {-0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ { 0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ {-0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ { 0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ {-0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ { 0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ {-0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ { 0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ {-0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ { 0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ {-0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ { 0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ {-0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ { 0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ {-0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ { 0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ {-0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ { 0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ {-0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ { 0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ {-0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ { 0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ {-0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ { 0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ {-0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ { 0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ {-0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ { 0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ {-0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ { 0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ {-0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ { 0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ {-0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ { 0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ {-0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ { 0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ {-0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ { 0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ {-0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ { 0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ {-0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ { 0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ {-0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ { 0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ {-0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ { 0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ {-0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ { 0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ {-0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ { 0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ {-0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ { 0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ {-0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ { 0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ {-0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ { 0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ {-0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ { 0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ {-0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ { 0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ {-0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ { 0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ {-0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ { 0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ {-0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ { 0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ {-0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ { 0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ {-0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ { 0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ {-0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ { 0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ {-0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ { 0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ {-0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ { 0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ {-0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ { 0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ {-0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ { 0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ {-0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ { 0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ {-0.999995293809576171511580126}, { 0.003067956762965976270145365}
+};
+
+const fpr fpr_p2_tab[] = {
+ { 2.00000000000 },
+ { 1.00000000000 },
+ { 0.50000000000 },
+ { 0.25000000000 },
+ { 0.12500000000 },
+ { 0.06250000000 },
+ { 0.03125000000 },
+ { 0.01562500000 },
+ { 0.00781250000 },
+ { 0.00390625000 },
+ { 0.00195312500 }
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.h b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.h
new file mode 100644
index 000000000..a0aefe702
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.h
@@ -0,0 +1,362 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <math.h>
+
+/*
+ * We wrap the native 'double' type into a structure so that the C compiler
+ * complains if we inadvertently use raw arithmetic operators on the 'fpr'
+ * type instead of using the inline functions below. This should have no
+ * extra runtime cost, since all the functions below are 'inline'.
+ */
+typedef struct {
+ double v;
+} fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x.v = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return FPR((double)i);
+}
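As a small standalone illustration (outside this patch), raw arithmetic on 'fpr' values is rejected by the compiler, so every operation has to go through the inline helpers defined below; 'add_example' is a hypothetical name and the sketch assumes inner.h (which pulls in this header) is on the include path.

    #include "inner.h"   /* provides fpr, FPR() and fpr_add() */

    static fpr add_example(void) {
        fpr x = FPR(1.5), y = FPR(2.25);
        /* fpr z = x + y;   -- does not compile: invalid operands to binary '+' */
        return fpr_add(x, y);   /* result has .v == 3.75 */
    }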
+
+static const fpr fpr_q = { 12289.0 };
+static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
+static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 };
+static const fpr fpr_inv_sigma[] = {
+ { 0.0 }, /* unused */
+ { 0.0069054793295940891952143765991630516 },
+ { 0.0068102267767177975961393730687908629 },
+ { 0.0067188101910722710707826117910434131 },
+ { 0.0065883354370073665545865037227681924 },
+ { 0.0064651781207602900738053897763485516 },
+ { 0.0063486788828078995327741182928037856 },
+ { 0.0062382586529084374473367528433697537 },
+ { 0.0061334065020930261548984001431770281 },
+ { 0.0060336696681577241031668062510953022 },
+ { 0.0059386453095331159950250124336477482 }
+};
+static const fpr fpr_sigma_min[] = {
+ { 0.0 }, /* unused */
+ { 1.1165085072329102588881898380334015 },
+ { 1.1321247692325272405718031785357108 },
+ { 1.1475285353733668684571123112513188 },
+ { 1.1702540788534828939713084716509250 },
+ { 1.1925466358390344011122170489094133 },
+ { 1.2144300507766139921088487776957699 },
+ { 1.2359260567719808790104525941706723 },
+ { 1.2570545284063214162779743112075080 },
+ { 1.2778336969128335860256340575729042 },
+ { 1.2982803343442918539708792538826807 }
+};
+static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
+static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
+static const fpr fpr_bnorm_max = { 16822.4121 };
+static const fpr fpr_zero = { 0.0 };
+static const fpr fpr_one = { 1.0 };
+static const fpr fpr_two = { 2.0 };
+static const fpr fpr_onehalf = { 0.5 };
+static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
+static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
+static const fpr fpr_ptwo31 = { 2147483648.0 };
+static const fpr fpr_ptwo31m1 = { 2147483647.0 };
+static const fpr fpr_mtwo31m1 = { -2147483647.0 };
+static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
+static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
+static const fpr fpr_ptwo63 = { 9223372036854775808.0 };
+
+static inline int64_t
+fpr_rint(fpr x) {
+ /*
+ * We do not want to use llrint() since it might not be
+ * constant-time.
+ *
+ * Suppose that x >= 0. If x >= 2^52, then it is already an
+ * integer. Otherwise, if x < 2^52, then computing x+2^52 will
+ * yield a value that will be rounded to the nearest integer
+ * with exactly the right rules (round-to-nearest-even).
+ *
+ * In order to have constant-time processing, we must do the
+ * computation for both x >= 0 and x < 0 cases, and use a
+ * cast to an integer to access the sign and select the proper
+ * value. Such casts also allow us to find out if |x| < 2^52.
+ */
+ int64_t sx, tx, rp, rn, m;
+ uint32_t ub;
+
+ sx = (int64_t)(x.v - 1.0);
+ tx = (int64_t)x.v;
+ rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
+ rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
+
+ /*
+ * If tx >= 2^52 or tx < -2^52, then result is tx.
+ * Otherwise, if sx >= 0, then result is rp.
+ * Otherwise, result is rn. We use the fact that when x is
+ * close to 0 (|x| <= 0.25) then both rp and rn are correct;
+ * and if x is not close to 0, then trunc(x-1.0) yields the
+ * appropriate sign.
+ */
+
+ /*
+ * Clamp rp to zero if tx < 0.
+ * Clamp rn to zero if tx >= 0.
+ */
+ m = sx >> 63;
+ rn &= m;
+ rp &= ~m;
+
+ /*
+ * Get the 12 upper bits of tx; if they are not all zeros or
+ * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
+ * rp and rn to zero. Otherwise, we clamp tx to zero.
+ */
+ ub = (uint32_t)((uint64_t)tx >> 52);
+ m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
+ rp &= m;
+ rn &= m;
+ tx &= ~m;
+
+ /*
+ * Only one of tx, rn or rp (at most) can be non-zero at this
+ * point.
+ */
+ return tx | rn | rp;
+}
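The 2^52 trick described in the comment above can be checked in isolation; this sketch (not part of the patch) assumes IEEE-754 binary64 doubles and the default round-to-nearest-even mode.

    #include <stdint.h>
    #include <stdio.h>

    /* For 0 <= x < 2^52, adding then subtracting 2^52 makes the FPU round
     * the fractional part away, with round-to-nearest-even on ties. */
    static int64_t rint_nonneg(double x) {
        return (int64_t)(x + 4503599627370496.0) - 4503599627370496;
    }

    int main(void) {
        printf("%lld\n", (long long)rint_nonneg(2.5)); /* 2 (tie rounds to even) */
        printf("%lld\n", (long long)rint_nonneg(3.5)); /* 4 (tie rounds to even) */
        printf("%lld\n", (long long)rint_nonneg(7.3)); /* 7 */
        return 0;
    }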
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x.v;
+ return r - (x.v < (double)r);
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x.v;
+}
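A quick standalone check (not part of the patch) of the distinction between the two functions above: a raw cast truncates toward zero, so negative non-integers need the branchless correction used in fpr_floor().

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        double x = -1.25;
        int64_t t = (int64_t)x;             /* trunc: -1 (toward zero)      */
        int64_t f = t - (x < (double)t);    /* floor: -2 (correction fires) */
        printf("trunc=%lld floor=%lld\n", (long long)t, (long long)f);
        return 0;
    }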
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return FPR(x.v + y.v);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return FPR(x.v - y.v);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return FPR(-x.v);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return FPR(x.v * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return FPR(x.v + x.v);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return FPR(x.v * y.v);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return FPR(x.v * x.v);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return FPR(1.0 / x.v);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return FPR(x.v / y.v);
+}
+
+static inline void
+fpr_sqrt_avx2(double *t) {
+ __m128d x;
+
+ x = _mm_load1_pd(t);
+ x = _mm_sqrt_pd(x);
+ _mm_storel_pd(t, x);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ /*
+ * We prefer not to have a dependency on libm when it can be
+ * avoided. On x86, calling the sqrt() libm function inlines
+ * the relevant opcode (fsqrt or sqrtsd, depending on whether
+ * the 387 FPU or SSE2 is used for floating-point operations)
+ * but then makes an optional call to the library function
+ * for proper error handling, in case the operand is negative.
+ *
+ * To avoid this dependency, we use intrinsics or inline assembly
+ * on recognized platforms:
+ *
+ * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
+ * to call the 387 FPU fsqrt opcode.
+ *
+ * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
+ * the fsqrt opcode (Clang needs a special hack).
+ *
+ * - On GCC/Clang on ARM with hardware floating-point, we use
+ * inline assembly to call the vsqrt.f64 opcode. Due to a
+ * complex ecosystem of compilers and assembly syntaxes, we
+ * have to call it "fsqrt" or "fsqrtd", depending on case.
+ *
+ * If the platform is not recognized, a call to the system
+ * library function sqrt() is performed. On some compilers, this
+ * may actually inline the relevant opcode, and call the library
+ * function only when the input is invalid (e.g. negative);
+ * Falcon never actually calls sqrt() on a negative value, but
+ * the dependency to libm will still be there.
+ */
+
+ fpr_sqrt_avx2(&x.v);
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x.v < y.v;
+}
+
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+
+ /*
+ * AVX2 implementation uses more operations than Horner's method,
+ * but with a lower expression tree depth. This helps because
+ * additions and multiplications have a latency of 4 cycles on
+ * a Skylake, but the CPU can issue two of them per cycle.
+ */
+
+ static const union {
+ double d[12];
+ __m256d v[3];
+ } c = {
+ {
+ 0.999999999999994892974086724280,
+ 0.500000000000019206858326015208,
+ 0.166666666666984014666397229121,
+ 0.041666666666110491190622155955,
+ 0.008333333327800835146903501993,
+ 0.001388888894063186997887560103,
+ 0.000198412739277311890541063977,
+ 0.000024801566833585381209939524,
+ 0.000002755586350219122514855659,
+ 0.000000275607356160477811864927,
+ 0.000000025299506379442070029551,
+ 0.000000002073772366009083061987
+ }
+ };
+
+ double d1, d2, d4, d8, y;
+ __m256d d14, d58, d9c;
+
+ d1 = -x.v;
+ d2 = d1 * d1;
+ d4 = d2 * d2;
+ d8 = d4 * d4;
+ d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
+ d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
+ d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
+ d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
+ d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
+ d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
+ d9c = _mm256_hadd_pd(d9c, d9c);
+ y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
+ + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
+ y *= ccs.v;
+
+ /*
+ * Final conversion goes through int64_t first, because that's what
+ * the underlying opcode (vcvttsd2si) will do, and we know that the
+ * result will fit, since x >= 0 and ccs < 1. If we did the
+ * conversion directly to uint64_t, then the compiler would add some
+ * extra code to cover the case of a source value of 2^63 or more,
+ * and though the alternate path would never be exercised, the
+ * extra comparison would cost us some cycles.
+ */
+ return (uint64_t)(int64_t)(y * fpr_ptwo63.v);
+
+}
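For reference, the same degree-12 FACCT polynomial can be evaluated with a plain scalar Horner scheme; the AVX2 code above computes the same value, only re-associated into 4-wide groups to shorten the dependency chain. This is a sketch outside the patch and 'expm_p63_scalar' is a hypothetical helper name.

    #include <stdint.h>

    static uint64_t expm_p63_scalar(double x, double ccs) {
        /* Coefficients of (-x)^12 down to (-x)^1, same table as the AVX2 code. */
        static const double C[12] = {
            0.000000002073772366009083061987,
            0.000000025299506379442070029551,
            0.000000275607356160477811864927,
            0.000002755586350219122514855659,
            0.000024801566833585381209939524,
            0.000198412739277311890541063977,
            0.001388888894063186997887560103,
            0.008333333327800835146903501993,
            0.041666666666110491190622155955,
            0.166666666666984014666397229121,
            0.500000000000019206858326015208,
            0.999999999999994892974086724280
        };
        double d1 = -x, y = C[0];
        int i;

        for (i = 1; i < 12; i ++) {
            y = y * d1 + C[i];
        }
        y = y * d1 + 1.0;   /* constant term of exp(-x) */
        return (uint64_t)(int64_t)(y * ccs * 9223372036854775808.0);
    }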
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED512_AVX2_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED512_AVX2_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/inner.h b/src/sig/falcon/pqclean_falcon-padded-512_avx2/inner.h
new file mode 100644
index 000000000..778174f93
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/inner.h
@@ -0,0 +1,827 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED512_AVX2_ macro (e.g. PQCLEAN_FALCONPADDED512_AVX2_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * This implementation uses AVX2 and optionally FMA intrinsics.
+ */
+#include <immintrin.h>
+#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)
+#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
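A usage sketch of these wrappers (outside the patch; the function and buffer names are illustrative): absorb data with as many inject() calls as needed, flip once, then extract.

    #include <stddef.h>
    #include <stdint.h>
    #include "inner.h"

    /* Absorb a 40-byte nonce followed by a message, then squeeze 64 bytes. */
    static void hash_nonce_message(const uint8_t nonce[40],
                                   const uint8_t *msg, size_t msg_len,
                                   uint8_t out[64]) {
        inner_shake256_context sc;

        inner_shake256_init(&sc);
        inner_shake256_inject(&sc, nonce, 40);
        inner_shake256_inject(&sc, msg, msg_len);
        inner_shake256_flip(&sc);
        inner_shake256_extract(&sc, out, 64);
        inner_shake256_ctx_release(&sc);
    }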
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED512_AVX2_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_AVX2_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED512_AVX2_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AVX2_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
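A round-trip sketch for the "trim" format (outside the patch; the helper name and buffer sizes are illustrative): encode, then decode and compare against the original coefficients.

    #include <stdint.h>
    #include <string.h>
    #include "inner.h"

    static int trim_i8_roundtrip(const int8_t *f, unsigned logn, unsigned bits) {
        uint8_t buf[2048];
        int8_t back[1024];
        size_t n = (size_t)1 << logn, enc_len, dec_len;

        enc_len = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
                      buf, sizeof buf, f, logn, bits);
        if (enc_len == 0) {
            return 0;   /* buffer too small, or a coefficient out of range */
        }
        dec_len = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
                      back, logn, bits, buf, enc_len);
        return dec_len == enc_len && memcmp(back, f, n) == 0;
    }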
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
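A sketch of how these primitives fit together on the verification side (outside the patch; fixed to logn = 9, i.e. Falcon(-padded)-512, and the function name is illustrative).

    #include <stdint.h>
    #include "inner.h"

    /* h[] holds the decoded public key, s2[] the decoded signature, and
     * sc is a flipped SHAKE256 context over nonce || message. */
    static int verify_sketch(uint16_t h[512], const int16_t s2[512],
                             inner_shake256_context *sc) {
        uint16_t hm[512];
        union { uint8_t b[2 * 512]; uint16_t align; } tmp;  /* 16-bit aligned */

        PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(sc, hm, 9);
        PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(h, 9);
        return PQCLEAN_FALCONPADDED512_AVX2_verify_raw(hm, s2, h, 9, tmp.b);
    }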
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x, fpr ccs)
+ * return exp(-x)*ccs, assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x)*ccs,
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. The PRNG draws from the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
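A seeding sketch for this PRNG (outside the patch; the seed handling and helper name are illustrative): the SHAKE256 context must be injected and flipped before prng_init() reads from it.

    #include <stddef.h>
    #include <stdint.h>
    #include "inner.h"

    static uint64_t draw_u64_from_seed(const uint8_t *seed, size_t seed_len) {
        inner_shake256_context sc;
        prng p;

        inner_shake256_init(&sc);
        inner_shake256_inject(&sc, seed, seed_len);
        inner_shake256_flip(&sc);
        PQCLEAN_FALCONPADDED512_AVX2_prng_init(&p, &sc);
        inner_shake256_ctx_release(&sc);
        return prng_get_u64(&p);
    }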
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+ }
+ return v;
+}
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
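A sketch of multiplying two polynomials modulo X^N+1 through the FFT domain (outside the patch; logn = 9 and the function name are illustrative).

    #include "inner.h"

    /* a and b hold N = 512 real coefficients each; on return, a holds
     * the product a*b reduced modulo X^512+1. */
    static void polymul_sketch(fpr a[512], fpr b[512]) {
        PQCLEAN_FALCONPADDED512_AVX2_FFT(a, 9);
        PQCLEAN_FALCONPADDED512_AVX2_FFT(b, 9);
        PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(a, b, 9);  /* FFT-domain product */
        PQCLEAN_FALCONPADDED512_AVX2_iFFT(a, 9);             /* back to coefficients */
    }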
+
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
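+
+/*
+ * As a quick check of the formula above: for logn = 9 (N = 512, the
+ * degree used by this parameter set), 28*2^9 = 14336 =
+ * FALCON_KEYGEN_TEMP_9, and for logn = 10, 28*2^10 = 28672 =
+ * FALCON_KEYGEN_TEMP_10. Only the logn = 1 and 2 entries (136 and
+ * 272) exceed 28*2^logn.
+ */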
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
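+
+/*
+ * Minimal usage sketch (an illustration, assuming a SHAKE256 context
+ * 'sc' that has already been seeded and flipped; logn = 9, N = 512):
+ *
+ *   int8_t f[512], g[512], F[512], G[512];
+ *   uint16_t h[512];
+ *   union { uint8_t b[FALCON_KEYGEN_TEMP_9]; uint64_t align; } tmp;
+ *
+ *   PQCLEAN_FALCONPADDED512_AVX2_keygen(&sc, f, g, F, G, h, 9, tmp.b);
+ *
+ * The union provides the required 64-bit alignment for tmp[]; G and h
+ * could also be passed as NULL, as described above.
+ */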
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED512_AVX2_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but it
+ * increases the signing cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
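+
+/*
+ * Memory trade-off between the two signing paths, worked out for
+ * logn = 9: the expanded key takes (8*9+40)*512 = 57344 bytes and
+ * sign_tree() needs a 48*512 = 24576-byte tmp[], while sign_dyn()
+ * needs no expanded key but a larger 72*512 = 36864-byte tmp[].
+ */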
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED512_AVX2_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(prng *p);
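+
+/*
+ * Typical use (a sketch of how the signing code drives these samplers,
+ * not an additional API guarantee): a sampler_context is filled by
+ * seeding its PRNG 'p' from the SHAKE256-based random source and
+ * setting 'sigma_min' to the per-degree minimum standard deviation;
+ * PQCLEAN_FALCONPADDED512_AVX2_sampler is then invoked with a pointer
+ * to that context for each integer to sample, and internally relies on
+ * gaussian0_sampler() as its half-Gaussian base sampler.
+ */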
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/keygen.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/keygen.c
new file mode 100644
index 000000000..8644e9163
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/keygen.c
@@ -0,0 +1,4233 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
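+
+/*
+ * For instance, the first entry below satisfies these constraints:
+ * 2147473409 - 1 = 2048 * 1048571, so p = 1 mod 2048, and
+ * 2147473409 < 2^31 = 2147483648.
+ */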
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
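+
+/*
+ * Why the iteration above converges: for odd p, the initial value
+ * y = 2 - p satisfies p*y = 1 mod 4, and each step y <- y*(2 - p*y)
+ * squares the error term, doubling the number of correct low bits
+ * (if p*y = 1 mod 2^k, then the new product is 1 mod 2^(2k)). Four
+ * iterations thus give 1/p mod 2^32, and the final negation and mask
+ * return -1/p mod 2^31.
+ */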
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
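+
+/*
+ * Note on the reduction above: since p0i = -1/p mod 2^31, the value
+ * w = ((z*p0i) mod 2^31) * p satisfies z + w = 0 mod 2^31, so the
+ * right shift by 31 drops only zero bits. Moreover, z < p^2 and
+ * w < 2^31*p, hence z + w < 2^63 (no 64-bit overflow) and
+ * (z + w)/2^31 < 2p, so subtracting p and conditionally adding it
+ * back yields a*b/2^31 mod p in the 0..p-1 range.
+ */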
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
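+
+/*
+ * The exponentiation above is Fermat inversion: for prime p and
+ * b != 0, b^(p-2) = 1/b mod p. The loop processes the 31 bits of the
+ * exponent p-2 from the top down, using a constant-time select
+ * (z ^= (z ^ z2) & mask) instead of a branch; when b = 0, every
+ * selected product is 0 and the function returns 0 as documented.
+ */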
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it (10 - logn) times,
+ * halving its order each time, to obtain an element of order 2N.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
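+
+/*
+ * In the loop above, the adjacent NTT slots f[2u] and f[2u+1]
+ * correspond to a pair of opposite evaluation points w and -w, and
+ * f(w)*f(-w) is the evaluation of f0^2 - X*f1^2 at w^2. The extra
+ * multiplication by R2 cancels the 1/R factor introduced by the inner
+ * Montgomery multiplication, so f[u] holds the plain product modulo p.
+ */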
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
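+
+/*
+ * Encoding example: with 31-bit words, the value 2^40 occupies two
+ * words, x[0] = 0 and x[1] = 512 (since 2^40 = 512 * 2^31), while -1
+ * over 'len' words is 0x7FFFFFFF repeated 'len' times (two's
+ * complement over 31*len bits).
+ */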
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m with a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contain the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
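+
+/*
+ * Toy example of the incremental CRT step implemented above (small
+ * primes, and ignoring the Montgomery factor carried by the
+ * precomputed 's' values): take p1 = 7, p2 = 11 and the integer 40,
+ * whose RNS representation is (40 mod 7, 40 mod 11) = (5, 7). After
+ * the first prime, the partial value is 5 and q = 7. For p = 11:
+ * s = 1/7 mod 11 = 8, xp = 7, xq = 5 mod 11 = 5, hence
+ * xr = 8*(7-5) mod 11 = 5, and the rebuilt value is 5 + 7*5 = 40.
+ */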
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have their 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remainder of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step reduces the total length
+ * (sum of the lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
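+ /*
+ * The next line clears c0 (and keeps it cleared) once a
+ * nonzero word of a or b has been seen: for a 31-bit word
+ * w, ((w + 0x7FFFFFFF) >> 31) is 1 when w != 0 and 0 when
+ * w = 0.
+ */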
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, thus a division by 2 really is a
+ * non-multiplication by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
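+ *
+ * If cA = 1 (a is halved), a_hi is shifted right while
+ * b_lo, qa and qb are doubled; if cA = 0 (b is halved),
+ * b_hi is shifted right while a_lo, pa and pb are doubled.
+ * Doubling the other side, instead of shifting this one,
+ * is what keeps bit i of a_lo/b_lo usable as the current
+ * low bit at the next iteration.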
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
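+ *
+ * For example, sc = 100 yields sch = 3 and scl = 7: each word of
+ * y is shifted left by 7 bits (with carry into the next word) and
+ * accumulation into x starts at word index 3.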
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each
+ * coefficient, but to a higher-order word.
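+ * (Callers do exactly that: solve_NTRU_intermediate() passes
+ * ft + slen - rlen with flen = rlen = 10 whenever slen exceeds
+ * 10 words.)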
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
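+ /*
+ * Multiplication is modulo X^N+1: once the running index
+ * u+v goes past N-1, the target pointer wraps back to the
+ * start of F and the sign of the multiplier is flipped
+ * (since X^N = -1).
+ */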
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ /*
+ * On little-endian systems we just interpret the bytes "as is"
+ * (this is correct because the exact-width types such as
+ * 'uint64_t' are guaranteed to have no padding and no trap
+ * representation).
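+ * This AVX2 variant targets x86 processors, which are
+ * little-endian, so keeping only this interpretation is safe
+ * here.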
+ */
+ uint64_t r;
+
+ inner_shake256_extract(rng, (uint8_t *)&r, sizeof r);
+ return r;
+}
+
+/*
+ * The table below encodes a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
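+ *
+ * For example, with logn = 9 (Falcon-512), two values are summed
+ * (g = 2 below), which yields standard deviation sigma*sqrt(2) =
+ * 1.17*sqrt(q/(2*512)), as required for N = 512.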
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
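+
+/*
+ * As a sanity check against the measured averages above:
+ * MAX_BL_SMALL[10] = 209 words is 209*31 = 6479 bits (average fg
+ * length at depth 10 is about 6308 bits), and MAX_BL_LARGE[9] = 308
+ * words is 308*31 = 9548 bits (average FG length at depth 9 is about
+ * 9403 bits).
+ */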
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
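+ /*
+ * ng has its top bit set if the running sum ever reached 2^31;
+ * in that case the OR below saturates the result to 2^32-1.
+ */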
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
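+ /*
+ * In NTT representation, the two values for a pair of roots
+ * (w, -w) are consecutive; their product is N(f)(w^2) =
+ * f(w)*f(-w), i.e. one NTT coefficient of f' at half degree.
+ * The extra Montgomery multiplication by R2 cancels the 1/R
+ * factor of the first one, so the stored value is the plain
+ * product modulo p.
+ */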
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure to solve the NTRU equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in fk->tmp (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficient will remain low. If it goes out of range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -2^31-1..+2^31-1.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
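+ * The loop below computes pdc = pt^dc by plain
+ * square-and-multiply, with pt set to 2 or 1/2 depending on
+ * the sign of the original dc.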
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the first n elements
+ * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * The integer representation of F and G is no longer needed,
+ * so we can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
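+ /*
+ * Note: modp_montymul(a, b, p, p0i) returns a*b/R mod p (Montgomery
+ * multiplication), so r below is q/R and each product in the loop
+ * carries the same 1/R factor; comparing z against r thus checks
+ * f*G - g*F = q mod p without further conversions.
+ */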
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
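+ /*
+ * The OR with -((normf | normg) >> 31) below forces norm to
+ * 2^32-1 whenever either squared norm has its top bit set (the
+ * saturation marker used by poly_small_sqnorm()), so an
+ * overflowing (f,g) simply fails the bound test and is resampled.
+ */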
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED512_AVX2_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/pqclean.c
new file mode 100644
index 000000000..171105004
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 666 bytes
+ *
+ * message + signature:
+ * signature 666 bytes
+ * message
+ */
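+
+/*
+ * Illustrative arithmetic for the layouts above, with nnnn = 9
+ * (Falcon-padded-512, 6 bits per f/g element):
+ * private key: 1 + 512*6/8 + 512*6/8 + 512 = 1281 bytes
+ * public key: 1 + 512*14/8 = 897 bytes
+ * signature: 1 + 40 + 625 (padded value) = 666 bytes
+ */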
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_9];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512];
+ uint16_t h[512];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED512_AVX2_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 9;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ f, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ g, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ F, 9, PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 9;
+ v = PQCLEAN_FALCONPADDED512_AVX2_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 9);
+ if (v != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
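+/*
+ * Note for the padded variant: both callers in this file pass
+ * sigbuflen = CRYPTO_BYTES - NONCELEN - 1, and once an encoded value
+ * fits, the unused tail of sigbuf[] is zero-filled (memset() below);
+ * do_verify() relies on this when it rejects any non-zero padding.
+ */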
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512], G[512];
+ struct {
+ int16_t sig[512];
+ uint16_t hm[512];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 9) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ f, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ g, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ F, 9, PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED512_AVX2_complete_private(G, f, g, F, 9, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(&sc, r.hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCONPADDED512_AVX2_comp_encode(sigbuf, sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[512], hm[512];
+ int16_t sig[512];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 9) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED512_AVX2_modq_decode(h, 9,
+ pk + 1, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(h, 9);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(&sc, hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED512_AVX2_verify_raw(hm, sig, h, 9, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 9;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 9) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 9;
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
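+
+/*
+ * Layout of the signed message produced above and consumed by
+ * crypto_sign_open() below (illustrative summary of this file's code):
+ * sm[0] header byte 0x30 + 9
+ * sm[1..40] nonce (NONCELEN bytes)
+ * sm[41..665] padded signature value (CRYPTO_BYTES - NONCELEN - 1 bytes)
+ * sm[666..] message (mlen bytes), hence smlen = CRYPTO_BYTES + mlen
+ */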
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 9) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c
new file mode 100644
index 000000000..203d31f9d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c
@@ -0,0 +1,179 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <assert.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng *p, inner_shake256_context *src) {
+ inner_shake256_extract(src, p->state.d, 56);
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
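+
+/*
+ * Concretely (as can be read off the store loop at the end of
+ * prng_refill() below): one refill produces 16*32 = 512 bytes, and the
+ * 32-bit word u (0 <= u < 16) of parallel instance j (0 <= j < 8) is
+ * stored little-endian at byte offset 32*u + 4*j of the buffer.
+ */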
+void
+PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+ int i;
+ uint32_t *sw;
+ union {
+ uint32_t w[16];
+ __m256i y[2]; /* for alignment */
+ } t;
+ __m256i state[16], init[16];
+
+ sw = (uint32_t *)p->state.d;
+
+ /*
+ * XOR next counter values into state.
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ t.w[u] = (uint32_t)(cc + u);
+ t.w[u + 8] = (uint32_t)((cc + u) >> 32);
+ }
+ *(uint64_t *)(p->state.d + 48) = cc + 8;
+
+ /*
+ * Load state.
+ */
+ for (u = 0; u < 4; u ++) {
+ state[u] = init[u] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)CW[u]));
+ }
+ for (u = 0; u < 10; u ++) {
+ state[u + 4] = init[u + 4] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[u]));
+ }
+ state[14] = init[14] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[10])),
+ _mm256_loadu_si256((__m256i *)&t.w[0]));
+ state[15] = init[15] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[11])),
+ _mm256_loadu_si256((__m256i *)&t.w[8]));
+
+ /*
+ * Do all rounds.
+ */
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 16), \
+ _mm256_srli_epi32(state[d], 16)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 12), \
+ _mm256_srli_epi32(state[b], 20)); \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 8), \
+ _mm256_srli_epi32(state[d], 24)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 7), \
+ _mm256_srli_epi32(state[b], 25)); \
+ } while (0)
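+
+ /*
+ * QROUND above is the standard ChaCha20 quarter-round applied to
+ * eight states at once; the slli/srli/or pairs implement the scalar
+ * rotations:
+ * a += b; d ^= a; d <<<= 16;
+ * c += d; b ^= c; b <<<= 12;
+ * a += b; d ^= a; d <<<= 8;
+ * c += d; b ^= c; b <<<= 7;
+ */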
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ /*
+ * Add initial state back and encode the result in the destination
+ * buffer. We can dump the AVX2 values "as is" because the non-AVX2
+ * code uses a compatible order of values.
+ */
+ for (u = 0; u < 16; u ++) {
+ _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
+ _mm256_add_epi32(state[u], init[u]));
+ }
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/sign.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/sign.c
new file mode 100644
index 000000000..0e8eb7173
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/sign.c
@@ -0,0 +1,1319 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees of size logn-1 each. Thus, the tree size s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
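+ /*
+ * (Quick check, for illustration:) the closed form below,
+ * s(logn) = (logn+1)*2^logn, satisfies both relations:
+ * s(0) = 1, and
+ * 2^logn + 2*logn*2^(logn-1) = (logn+1)*2^logn.
+ * E.g. s(2) = 4 + 2*s(1) = 4 + 2*4 = 12 = 3*2^2.
+ */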
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr w0, w1, w2, w3, sigma;
+ __m128d ww0, ww1, wa, wb, wc, wd;
+ __m128d wy0, wy1, wz0, wz1;
+ __m128d half, invsqrt8, invsqrt2, neghi, neglo;
+ int si0, si1, si2, si3;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ half = _mm_set1_pd(0.5);
+ invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052);
+ invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105);
+ neghi = _mm_set_pd(-0.0, 0.0);
+ neglo = _mm_set_pd(0.0, -0.0);
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ ww0 = _mm_loadu_pd(&t1[0].v);
+ ww1 = _mm_loadu_pd(&t1[2].v);
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree1[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree1[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree1[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z1[0].v, ww0);
+ _mm_storeu_pd(&z1[2].v, ww1);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0);
+ wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1);
+ wz0 = _mm_loadu_pd(&tree[0].v);
+ wz1 = _mm_loadu_pd(&tree[2].v);
+ ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1));
+ ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0));
+ ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v));
+ ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v));
+
+ /*
+ * Second recursive invocation.
+ */
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree0[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree0[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree0[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z0[0].v, ww0);
+ _mm_storeu_pd(&z0[2].v, ww1);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix uses the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Sample an integer value along a half-Gaussian distribution centered
+ * on zero and with standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(prng *p) {
+
+ /*
+ * High words.
+ */
+ static const union {
+ uint16_t u16[16];
+ __m256i ymm[1];
+ } rhi15 = {
+ {
+ 0x51FB, 0x2A69, 0x113E, 0x0568,
+ 0x014A, 0x003B, 0x0008, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000
+ }
+ };
+
+ static const union {
+ uint64_t u64[20];
+ __m256i ymm[5];
+ } rlo57 = {
+ {
+ 0x1F42ED3AC391802, 0x12B181F3F7DDB82,
+ 0x1CDD0934829C1FF, 0x1754377C7994AE4,
+ 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F,
+ 0x024DD542B776AE4, 0x1A1FFDC65AD63DA,
+ 0x01F80D88A7B6428, 0x001C3FDB2040C69,
+ 0x00012CF24D031FB, 0x00000949F8B091F,
+ 0x0000003665DA998, 0x00000000EBF6EBB,
+ 0x0000000002F5D7E, 0x000000000007098,
+ 0x0000000000000C6, 0x000000000000001,
+ 0x000000000000000, 0x000000000000000
+ }
+ };
+
+ uint64_t lo;
+ unsigned hi;
+ __m256i xhi, rhi, gthi, eqhi, eqm;
+ __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4;
+ __m128i t, zt;
+ int r;
+
+ /*
+ * Get a 72-bit random value and split it into a low part
+ * (57 bits) and a high part (15 bits)
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ hi = (hi << 7) | (unsigned)(lo >> 57);
+ lo &= 0x1FFFFFFFFFFFFFF;
+
+ /*
+ * Broadcast the high part and compare it with the relevant
+ * values. We need both a "greater than" and an "equal"
+ * comparison.
+ */
+ xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128((int)hi));
+ rhi = _mm256_loadu_si256(&rhi15.ymm[0]);
+ gthi = _mm256_cmpgt_epi16(rhi, xhi);
+ eqhi = _mm256_cmpeq_epi16(rhi, xhi);
+
+ /*
+ * The result is the number of 72-bit values (among the list of 19)
+ * which are greater than the 72-bit random value. We first count
+ * all non-zero 16-bit elements in the first eight lanes of gthi. Such
+ * elements have value -1 or 0, so we first negate them.
+ */
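+ /*
+ * Illustrative reference sketch (not part of the upstream code; the
+ * loop below is only a non-constant-time restatement of what this
+ * function computes): the result is the number of the 19 table
+ * entries whose 72-bit value (thi, tlo) strictly exceeds the random
+ * value (hi, lo):
+ *
+ *     int r = 0;
+ *     for (size_t i = 0; i < 19; i ++) {
+ *         unsigned thi = (i < 16) ? rhi15.u16[i] : 0;
+ *         uint64_t tlo = rlo57.u64[i];
+ *         if (thi > hi || (thi == hi && tlo > lo)) {
+ *             r ++;
+ *         }
+ *     }
+ */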
+ t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15);
+ zt = _mm_setzero_si128();
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ r = _mm_cvtsi128_si32(t);
+
+ /*
+ * We must look at the low bits for all values for which the
+ * high bits are an "equal" match; values 8-18 all have the
+ * same high bits (0).
+ * On 32-bit systems, 'lo' really is two registers, requiring
+ * some extra code.
+ */
+ #if defined(__x86_64__) || defined(_M_X64)
+ xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
+ #else
+ {
+ uint32_t e0, e1;
+ int32_t f0, f1;
+
+ e0 = (uint32_t)lo;
+ e1 = (uint32_t)(lo >> 32);
+ f0 = *(int32_t *)&e0;
+ f1 = *(int32_t *)&e1;
+ xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0);
+ }
+ #endif
+ gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
+ gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
+ gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
+ gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo);
+ gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo);
+
+ /*
+ * Keep only comparison results that correspond to the non-zero
+ * elements in eqhi.
+ */
+ gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(eqhi)));
+ gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8))));
+ eqm = _mm256_permute4x64_epi64(eqhi, 0xFF);
+ gtlo2 = _mm256_and_si256(gtlo2, eqm);
+ gtlo3 = _mm256_and_si256(gtlo3, eqm);
+ gtlo4 = _mm256_and_si256(gtlo4, eqm);
+
+ /*
+ * Add all values to count the total number of "-1" elements.
+ * Since the first eight "high" words are all different, only
+ * one element (at most) in gtlo0:gtlo1 can be non-zero; however,
+ * if the high word of the random value is zero, then many
+ * elements of gtlo2:gtlo3:gtlo4 can be non-zero.
+ */
+ gtlo0 = _mm256_or_si256(gtlo0, gtlo1);
+ gtlo0 = _mm256_add_epi64(
+ _mm256_add_epi64(gtlo0, gtlo2),
+ _mm256_add_epi64(gtlo3, gtlo4));
+ t = _mm_add_epi64(
+ _mm256_castsi256_si128(gtlo0),
+ _mm256_extracti128_si256(gtlo0, 1));
+ t = _mm_add_epi64(t, _mm_srli_si128(t, 8));
+ r -= _mm_cvtsi128_si32(t);
+
+ return r;
+
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
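+ /*
+ * (Illustrative example, not from the upstream source: for x = 3.0
+ * we get s = 4 and r = 3.0 - 4*log(2) ~= 0.2274, so that
+ * exp(-3.0) = 2^(-4) * exp(-0.2274) ~= 0.0498.)
+ */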
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * we have exp(-x) = 2^(-s)*exp(-r). We compare exp(-x) lazily with
+ * the PRNG output, one byte at a time, to limit randomness
+ * consumption; the sign of the difference yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED512_AVX2_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
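+ /* i.e. z = 1 + z0 when b = 1, and z = -z0 when b = 0
+    (explanatory note). */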
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes the rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signing produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signing produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/vrfy.c
new file mode 100644
index 000000000..6abf55d18
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
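+ /*
+ * (Illustrative example: x = -3 becomes y = 2^32 - 3, whose high
+ * bit is set, so q is added back and the result is 12286,
+ * i.e. -3 mod 12289.)
+ */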
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
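+ /*
+ * (Illustrative example: for x = 3 and y = 2, x + y - q wraps
+ * around, so q is added back and the result is 5; for
+ * x = y = 12288, the result is 24576 - 12289 = 12287.)
+ */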
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
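+/*
+ * (Illustrative example: mq_rshift1(1) first adds q to make the value
+ * even, giving 12290, then shifts right to obtain 6145, which is indeed
+ * 1/2 mod q since 2*6145 = 12290 = 1 mod 12289.)
+ */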
+static inline uint32_t
+mq_rshift1(uint32_t x) {
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
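+ /*
+ * (Worked example, illustrative only: with x = 1 and y = R = 4091,
+ * we get z = 4091, k = (4091 * Q0I) mod 2^16 = 5, w = 5*q = 61445,
+ * z + w = 65536, and (z + w) >> 16 = 1, i.e. (1*R)/R mod q.)
+ */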
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x and y are both lower than q, we have
+ * z <= (q - 1)^2, so the sum is at most
+ * (2^16 - 1) * q + (q - 1)^2, which fits on 30 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We do a subtraction then conditional subtraction to
+ * ensure the result is in the expected range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
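+ /*
+ * (Illustrative note, not from the upstream source: q = 12289 is
+ * prime, so Fermat's little theorem gives y^(q-2) = 1/y mod q.
+ * For instance, mq_div_12289(1, 2) returns 6145, and indeed
+ * 2 * 6145 = 12290 = 1 mod 12289.)
+ */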
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
+ return mq_montymul(y18, x);
+}
+
+/*
+ * Compute NTT on a ring element.
+ */
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+ * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
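+ /*
+ * (Explanatory note, not in the upstream source: a valid Falcon
+ * signature satisfies s1 + s2*h = c0 mod phi mod q, so computing
+ * s2*h - c0 recovers -s1 and lets us check the norm of (s1, s2)
+ * without transmitting s1.)
+ */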
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED512_AVX2_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED512_AVX2_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/LICENSE b/src/sig/falcon/pqclean_falcon-padded-512_clean/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/api.h b/src/sig/falcon/pqclean_falcon-padded-512_clean/api.h
new file mode 100644
index 000000000..47c131469
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED512_CLEAN_API_H
+#define PQCLEAN_FALCONPADDED512_CLEAN_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES 1281
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES 666
+
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_ALGNAME "Falcon-padded-512"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
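+/*
+ * Minimal usage sketch (illustrative only, not part of the upstream API
+ * header; 'msg' and 'msglen' are hypothetical caller-provided values and
+ * error handling is omitted):
+ *
+ *     uint8_t pk[PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES];
+ *     uint8_t sk[PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES];
+ *     uint8_t sig[PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES];
+ *     size_t siglen;
+ *
+ *     PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(pk, sk);
+ *     PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(
+ *         sig, &siglen, msg, msglen, sk);
+ *     int ok = PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(
+ *         sig, siglen, msg, msglen, pk) == 0;
+ */
+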
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/codec.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/codec.c
new file mode 100644
index 000000000..2105122ec
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
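+ /*
+ * (Illustrative note: for logn = 9, i.e. n = 512, this gives
+ * out_len = 512*14/8 = 896 bytes; together with a one-byte header
+ * this accounts for the 897-byte public key size declared in api.h.)
+ */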
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * will fit in the accumulator, which is a uint32_t.
+ */
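+ /*
+ * (Worked example, illustrative and not from the upstream source:
+ * x[u] = -260 is encoded as the sign bit 1, the low seven bits of
+ * 260 (0000100), then w = 260 >> 7 = 2 zeros and a final 1, for a
+ * total of 11 bits: 1 0000100 001.)
+ */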
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for the private key and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest signature coefficient observed during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/common.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/common.c
new file mode 100644
index 000000000..74e88e903
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
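+
+/*
+ * Illustrative note on the rejection step above (not part of the
+ * reference code): a 16-bit sample w is kept only when
+ * w < 61445 = 5 * 12289, so the acceptance range contains exactly five
+ * representatives of each residue class modulo q = 12289, and the
+ * retained value w mod q is uniform in 0..12288. The reduction loop
+ * performs at most four subtractions, since w < 5*q.
+ */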
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
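+
+    /*
+     * Illustrative check of the table above (not part of the reference
+     * code): samples with index 2*n or more spill into the stack buffer
+     * tt2[], which therefore needs overtab[logn] - n entries whenever
+     * that quantity is positive. The worst cases are logn = 1, 2 and 3,
+     * where 65 - 2 = 67 - 4 = 71 - 8 = 63 entries are needed, hence the
+     * 63-entry tt2[] declared below; for logn >= 7 the caller-provided
+     * tmp[] is large enough on its own.
+     */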
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+     * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+     * We also reduce the values modulo q; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
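+
+    /*
+     * Worked example of the branchless reduction above (illustrative
+     * note, not part of the reference code): for w = 30000, the first
+     * conditional subtraction of 2*q = 24578 gives wr = 5422, the next
+     * two leave it unchanged, and since 30000 < 61445 the sample is
+     * kept with wr = 30000 mod 12289 = 5422. For w = 62000 >= 61445,
+     * the final mask sets wr to all-ones, i.e. the 0xFFFF "rejected"
+     * marker.
+     */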
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
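+
+/*
+ * Illustrative note on the saturation trick above (not part of the
+ * reference code): each term z*z is at most 2^30 (|z| <= 2^15), so the
+ * 32-bit accumulator must pass through a value with its top bit set
+ * before it can wrap around. OR-ing every partial sum into ng records
+ * any such excursion, and "s |= -(ng >> 31)" then forces s to
+ * 0xFFFFFFFF, which exceeds every entry of l2bound[], so an
+ * overflowing (hence too large) signature is always rejected.
+ */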
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/fft.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/fft.c
new file mode 100644
index 000000000..011fbe11d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/fft.c
@@ -0,0 +1,699 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
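+
+/*
+ * Minimal usage sketch for the macros above (illustrative only, not
+ * part of the reference code; it assumes fpr_of() from fpr.h). It
+ * computes (3 + 4i) * (1 + 2i) = -5 + 10i.
+ */
+static void
+fpc_mul_example(void) {
+    fpr a_re = fpr_of(3), a_im = fpr_of(4);
+    fpr b_re = fpr_of(1), b_im = fpr_of(2);
+    fpr c_re, c_im;
+
+    FPC_MUL(c_re, c_im, a_re, a_im, b_re, b_im);
+    /* c_re now holds -5.0 and c_im holds 10.0 (as fpr values) */
+    (void)c_re;
+    (void)c_im;
+}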
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
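+
+/*
+ * Worked example of this layout for logn = 2 (N = 4), as an
+ * illustrative note (not part of the reference code): rev() works over
+ * 2 bits, so rev(0) = 0 and rev(1) = 2. Re(f(w_0)) and Re(f(w_1)) land
+ * in slots 0 and 1, and Im(f(w_0)) and Im(f(w_1)) in slots 2 and 3;
+ * f(w_2) and f(w_3) are the conjugates of f(w_1) and f(w_0) and are
+ * not stored.
+ */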
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+     * simple doubling of all the values in the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
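+
+/*
+ * Illustrative sketch (not part of the reference code; it only assumes
+ * the prototypes already declared in inner.h): multiplying two
+ * polynomials modulo X^N + 1 by going through the FFT domain. Both
+ * inputs are given in coefficient representation and are overwritten.
+ */
+static void
+poly_mul_example(fpr *a, fpr *b, unsigned logn) {
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(a, logn);              /* a -> FFT(a) */
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(b, logn);              /* b -> FFT(b) */
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(a, b, logn);  /* pointwise product */
+    PQCLEAN_FALCONPADDED512_CLEAN_iFFT(a, logn);             /* a -> a*b mod X^N+1 */
+}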
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+}
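+
+/*
+ * Illustrative note (not part of the reference code): per FFT slot the
+ * loop above computes mu = g01 / g00, replaces g11 with
+ * d11 = g11 - mu*adj(g01) = g11 - |g01|^2 / g00, and writes adj(mu)
+ * back into g01. These are the L and D factors of the LDL*
+ * decomposition of the self-adjoint 2x2 matrix
+ * [[g00, g01], [adj(g01), g11]], as used by Falcon's fast Fourier
+ * sampling.
+ */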
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+     * bit-reversal function over the ring degree). This changes
+     * indexes with regard to the Falcon specification.
+ */
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * We process complex values by pairs. For logn = 1, there is only
+ * one complex value (the other one is the implicit conjugate),
+ * so we add the two lines below because the loop will be
+ * skipped.
+ */
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * An extra copy to handle the special case logn = 1.
+ */
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.c
new file mode 100644
index 000000000..82ff1df46
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.c
@@ -0,0 +1,1622 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Normalize a provided unsigned integer to the 2^63..2^64-1 range by
+ * left-shifting it if necessary. The exponent e is adjusted accordingly
+ * (i.e. if the value was left-shifted by n bits, then n is subtracted
+ * from e). If source m is 0, then it remains 0, but e is altered.
+ * Both m and e must be simple variables (no expressions allowed).
+ */
+#define FPR_NORM64(m, e) do { \
+ uint32_t nt; \
+ \
+ (e) -= 63; \
+ \
+ nt = (uint32_t)((m) >> 32); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 5); \
+ \
+ nt = (uint32_t)((m) >> 48); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 4); \
+ \
+ nt = (uint32_t)((m) >> 56); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 3); \
+ \
+ nt = (uint32_t)((m) >> 60); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 2); \
+ \
+ nt = (uint32_t)((m) >> 62); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 1); \
+ \
+ nt = (uint32_t)((m) >> 63); \
+ (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt); \
+ } while (0)
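+
+/*
+ * Small self-check of the macro above (illustrative note, not part of
+ * the reference code): for m = 1 every stage shifts, for a total of
+ * 63 bits, so m ends up equal to 2^63 and e is lowered by 63; for an
+ * already-normalized m (top bit set) no stage shifts and the "+32",
+ * "+16", ..., "+1" corrections add back the 63 subtracted up front,
+ * leaving e unchanged.
+ */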
+
+fpr
+fpr_scaled(int64_t i, int sc) {
+ /*
+ * To convert from int to float, we have to do the following:
+ * 1. Get the absolute value of the input, and its sign
+ * 2. Shift right or left the value as appropriate
+ * 3. Pack the result
+ *
+ * We can assume that the source integer is not -2^63.
+ */
+ int s, e;
+ uint32_t t;
+ uint64_t m;
+
+ /*
+ * Extract sign bit.
+ * We have: -i = 1 + ~i
+ */
+ s = (int)((uint64_t)i >> 63);
+ i ^= -(int64_t)s;
+ i += s;
+
+ /*
+ * For now we suppose that i != 0.
+     * We set m to i and left-shift it as much as needed
+ * to get a 1 in the top bit. We can do that in a logarithmic
+ * number of conditional shifts.
+ */
+ m = (uint64_t)i;
+ e = 9 + sc;
+ FPR_NORM64(m, e);
+
+ /*
+ * Now m is in the 2^63..2^64-1 range. We must divide it by 512;
+ * if one of the dropped bits is a 1, this should go into the
+ * "sticky bit".
+ */
+ m |= ((uint32_t)m & 0x1FF) + 0x1FF;
+ m >>= 9;
+
+ /*
+ * Corrective action: if i = 0 then all of the above was
+ * incorrect, and we clamp e and m down to zero.
+ */
+ t = (uint32_t)((uint64_t)(i | -i) >> 63);
+ m &= -(uint64_t)t;
+ e &= -(int)t;
+
+ /*
+ * Assemble back everything. The FPR() function will handle cases
+ * where e is too low.
+ */
+ return FPR(s, e, m);
+}
+
+fpr
+fpr_add(fpr x, fpr y) {
+ uint64_t m, xu, yu, za;
+ uint32_t cs;
+ int ex, ey, sx, sy, cc;
+
+ /*
+ * Make sure that the first operand (x) has the larger absolute
+ * value. This guarantees that the exponent of y is less than
+ * or equal to the exponent of x, and, if they are equal, then
+ * the mantissa of y will not be greater than the mantissa of x.
+ *
+ * After this swap, the result will have the sign x, except in
+ * the following edge case: abs(x) = abs(y), and x and y have
+ * opposite sign bits; in that case, the result shall be +0
+ * even if the sign bit of x is 1. To handle this case properly,
+     * we do the swap if abs(x) = abs(y) AND the sign of x is 1.
+ */
+ m = ((uint64_t)1 << 63) - 1;
+ za = (x & m) - (y & m);
+ cs = (uint32_t)(za >> 63)
+ | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63));
+ m = (x ^ y) & -(uint64_t)cs;
+ x ^= m;
+ y ^= m;
+
+ /*
+ * Extract sign bits, exponents and mantissas. The mantissas are
+ * scaled up to 2^55..2^56-1, and the exponent is unbiased. If
+ * an operand is zero, its mantissa is set to 0 at this step, and
+ * its exponent will be -1078.
+ */
+ ex = (int)(x >> 52);
+ sx = ex >> 11;
+ ex &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52;
+ xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ex -= 1078;
+ ey = (int)(y >> 52);
+ sy = ey >> 11;
+ ey &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52;
+ yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ey -= 1078;
+
+ /*
+ * x has the larger exponent; hence, we only need to right-shift y.
+ * If the shift count is larger than 59 bits then we clamp the
+ * value to zero.
+ */
+ cc = ex - ey;
+ yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31);
+ cc &= 63;
+
+ /*
+ * The lowest bit of yu is "sticky".
+ */
+ m = fpr_ulsh(1, cc) - 1;
+ yu |= (yu & m) + m;
+ yu = fpr_ursh(yu, cc);
+
+ /*
+ * If the operands have the same sign, then we add the mantissas;
+ * otherwise, we subtract the mantissas.
+ */
+ xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy));
+
+ /*
+ * The result may be smaller, or slightly larger. We normalize
+ * it to the 2^63..2^64-1 range (if xu is zero, then it stays
+ * at zero).
+ */
+ FPR_NORM64(xu, ex);
+
+ /*
+     * Scale down the value to 2^54..2^55-1, handling the last bit
+ * as sticky.
+ */
+ xu |= ((uint32_t)xu & 0x1FF) + 0x1FF;
+ xu >>= 9;
+ ex += 9;
+
+ /*
+ * In general, the result has the sign of x. However, if the
+ * result is exactly zero, then the following situations may
+ * be encountered:
+ * x > 0, y = -x -> result should be +0
+ * x < 0, y = -x -> result should be +0
+ * x = +0, y = +0 -> result should be +0
+ * x = -0, y = +0 -> result should be +0
+ * x = +0, y = -0 -> result should be +0
+ * x = -0, y = -0 -> result should be -0
+ *
+ * But at the conditional swap step at the start of the
+ * function, we ensured that if abs(x) = abs(y) and the
+ * sign of x was 1, then x and y were swapped. Thus, the
+ * two following cases cannot actually happen:
+ * x < 0, y = -x
+ * x = -0, y = +0
+ * In all other cases, the sign bit of x is conserved, which
+ * is what the FPR() function does. The FPR() function also
+ * properly clamps values to zero when the exponent is too
+ * low, but does not alter the sign in that case.
+ */
+ return FPR(sx, ex, xu);
+}
+
+fpr
+fpr_mul(fpr x, fpr y) {
+ uint64_t xu, yu, w, zu, zv;
+ uint32_t x0, x1, y0, y1, z0, z1, z2;
+ int ex, ey, d, e, s;
+
+ /*
+ * Extract absolute values as scaled unsigned integers. We
+ * don't extract exponents yet.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * We have two 53-bit integers to multiply; we need to split
+     * each into a lower half and an upper half. Moreover, we
+ * prefer to have lower halves to be of 25 bits each, for
+ * reasons explained later on.
+ */
+ x0 = (uint32_t)xu & 0x01FFFFFF;
+ x1 = (uint32_t)(xu >> 25);
+ y0 = (uint32_t)yu & 0x01FFFFFF;
+ y1 = (uint32_t)(yu >> 25);
+ w = (uint64_t)x0 * (uint64_t)y0;
+ z0 = (uint32_t)w & 0x01FFFFFF;
+ z1 = (uint32_t)(w >> 25);
+ w = (uint64_t)x0 * (uint64_t)y1;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 = (uint32_t)(w >> 25);
+ w = (uint64_t)x1 * (uint64_t)y0;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 += (uint32_t)(w >> 25);
+ zu = (uint64_t)x1 * (uint64_t)y1;
+ z2 += (z1 >> 25);
+ z1 &= 0x01FFFFFF;
+ zu += z2;
+
+ /*
+ * Since xu and yu are both in the 2^52..2^53-1 range, the
+ * product is in the 2^104..2^106-1 range. We first reassemble
+ * it and round it into the 2^54..2^56-1 range; the bottom bit
+ * is made "sticky". Since the low limbs z0 and z1 are 25 bits
+ * each, we just take the upper part (zu), and consider z0 and
+ * z1 only for purposes of stickiness.
+ * (This is the reason why we chose 25-bit limbs above.)
+ */
+ zu |= ((z0 | z1) + 0x01FFFFFF) >> 25;
+
+ /*
+     * We normalize zu to the 2^54..2^55-1 range: it could be one
+ * bit too large at this point. This is done with a conditional
+ * right-shift that takes into account the sticky bit.
+ */
+ zv = (zu >> 1) | (zu & 1);
+ w = zu >> 55;
+ zu ^= (zu ^ zv) & -w;
+
+ /*
+ * Get the aggregate scaling factor:
+ *
+ * - Each exponent is biased by 1023.
+ *
+ * - Integral mantissas are scaled by 2^52, hence an
+ * extra 52 bias for each exponent.
+ *
+ * - However, we right-shifted z by 50 bits, and then
+ * by 0 or 1 extra bit (depending on the value of w).
+ *
+ * In total, we must add the exponents, then subtract
+ * 2 * (1023 + 52), then add 50 + w.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex + ey - 2100 + (int)w;
+
+ /*
+ * Sign bit is the XOR of the operand sign bits.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if either of the operands is
+ * zero, then the computations above were wrong. Test for zero
+ * is whether ex or ey is zero. We just have to set the mantissa
+ * (zu) to zero, the FPR() function will normalize e.
+ */
+ d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11;
+ zu &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, zu);
+}
+
+fpr
+fpr_div(fpr x, fpr y) {
+ uint64_t xu, yu, q, q2, w;
+ int i, ex, ey, e, d, s;
+
+ /*
+ * Extract mantissas of x and y (unsigned).
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * Perform bit-by-bit division of xu by yu. We run it for 55 bits.
+ */
+ q = 0;
+ for (i = 0; i < 55; i ++) {
+ /*
+         * If yu is less than or equal to xu, then subtract it and
+ * push a 1 in the quotient; otherwise, leave xu unchanged
+ * and push a 0.
+ */
+ uint64_t b;
+
+ b = ((xu - yu) >> 63) - 1;
+ xu -= b & yu;
+ q |= b & 1;
+ xu <<= 1;
+ q <<= 1;
+ }
+
+ /*
+ * We got 55 bits in the quotient, followed by an extra zero. We
+ * want that 56th bit to be "sticky": it should be a 1 if and
+ * only if the remainder (xu) is non-zero.
+ */
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Quotient is at most 2^56-1. Its top bit may be zero, but in
+ * that case the next-to-top bit will be a one, since the
+ * initial xu and yu were both in the 2^52..2^53-1 range.
+ * We perform a conditional shift to normalize q to the
+ * 2^54..2^55-1 range (with the bottom bit being sticky).
+ */
+ q2 = (q >> 1) | (q & 1);
+ w = q >> 55;
+ q ^= (q ^ q2) & -w;
+
+ /*
+ * Extract exponents to compute the scaling factor:
+ *
+ * - Each exponent is biased and we scaled them up by
+ * 52 bits; but these biases will cancel out.
+ *
+ * - The division loop produced a 55-bit shifted result,
+ * so we must scale it down by 55 bits.
+ *
+ * - If w = 1, we right-shifted the integer by 1 bit,
+ * hence we must add 1 to the scaling.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex - ey - 55 + (int)w;
+
+ /*
+ * Sign is the XOR of the signs of the operands.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if x = 0, then the computation
+ * is wrong, and we must clamp e and q to 0. We do not care
+ * about the case y = 0 (as per assumptions in this module,
+ * the caller does not perform divisions by zero).
+ */
+ d = (ex + 0x7FF) >> 11;
+ s &= d;
+ e &= -d;
+ q &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, q);
+}
+
+fpr
+fpr_sqrt(fpr x) {
+ uint64_t xu, q, s, r;
+ int ex, e;
+
+ /*
+ * Extract the mantissa and the exponent. We don't care about
+ * the sign: by assumption, the operand is nonnegative.
+ * We want the "true" exponent corresponding to a mantissa
+ * in the 1..2 range.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ ex = (int)((x >> 52) & 0x7FF);
+ e = ex - 1023;
+
+ /*
+ * If the exponent is odd, double the mantissa and decrement
+ * the exponent. The exponent is then halved to account for
+ * the square root.
+ */
+ xu += xu & -(uint64_t)(e & 1);
+ e >>= 1;
+
+ /*
+ * Double the mantissa.
+ */
+ xu <<= 1;
+
+ /*
+ * We now have a mantissa in the 2^53..2^55-1 range. It
+ * represents a value between 1 (inclusive) and 4 (exclusive)
+ * in fixed point notation (with 53 fractional bits). We
+ * compute the square root bit by bit.
+ */
+ q = 0;
+ s = 0;
+ r = (uint64_t)1 << 53;
+ for (int i = 0; i < 54; i ++) {
+ uint64_t t, b;
+
+ t = s + r;
+ b = ((xu - t) >> 63) - 1;
+ s += (r << 1) & b;
+ xu -= t & b;
+ q += r & b;
+ xu <<= 1;
+ r >>= 1;
+ }
+
+ /*
+ * Now, q is a rounded-low 54-bit value, with a leading 1,
+ * 52 fractional digits, and an additional guard bit. We add
+ * an extra sticky bit to account for what remains of the operand.
+ */
+ q <<= 1;
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Result q is in the 2^54..2^55-1 range; we bias the exponent
+ * by 54 bits (the value e at that point contains the "true"
+ * exponent, but q is now considered an integer, i.e. scaled
+     * up).
+ */
+ e -= 54;
+
+ /*
+ * Corrective action for an operand of value zero.
+ */
+ q &= -(uint64_t)((ex + 0x7FF) >> 11);
+
+ /*
+     * Apply rounding and pack the result.
+ */
+ return FPR(0, e, q);
+}
+
+uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Here, the coefficients have been scaled up by 2^63 and
+ * converted to integers.
+ *
+     * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+ static const uint64_t C[] = {
+ 0x00000004741183A3u,
+ 0x00000036548CFC06u,
+ 0x0000024FDCBF140Au,
+ 0x0000171D939DE045u,
+ 0x0000D00CF58F6F84u,
+ 0x000680681CF796E3u,
+ 0x002D82D8305B0FEAu,
+ 0x011111110E066FD0u,
+ 0x0555555555070F00u,
+ 0x155555555581FF00u,
+ 0x400000000002B400u,
+ 0x7FFFFFFFFFFF4800u,
+ 0x8000000000000000u
+ };
+
+ uint64_t z, y;
+ unsigned u;
+ uint32_t z0, z1, y0, y1;
+ uint64_t a, b;
+
+ y = C[0];
+ z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
+ for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) {
+ /*
+ * Compute product z * y over 128 bits, but keep only
+ * the top 64 bits.
+ *
+ * TODO: On some architectures/compilers we could use
+ * some intrinsics (__umulh() on MSVC) or other compiler
+ * extensions (unsigned __int128 on GCC / Clang) for
+ * improved speed; however, most 64-bit architectures
+ * also have appropriate IEEE754 floating-point support,
+ * which is better.
+ */
+ uint64_t c;
+
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ c = (a >> 32) + (b >> 32);
+ c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ c += (uint64_t)z1 * (uint64_t)y1;
+ y = C[u] - c;
+ }
+
+ /*
+ * The scaling factor must be applied at the end. Since y is now
+ * in fixed-point notation, we have to convert the factor to the
+ * same format, and do an extra integer multiplication.
+ */
+ z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ y = (a >> 32) + (b >> 32);
+ y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ y += (uint64_t)z1 * (uint64_t)y1;
+
+ return y;
+}
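+
+/*
+ * Illustrative note on the loop above (not part of the reference
+ * code): with z ~= x * 2^64 it is a fixed-point Horner evaluation,
+ * each step computing y <- C[u] - floor(z * y / 2^64), so that after
+ * the loop y ~= exp(-x) * 2^63 (the C[] entries are essentially the
+ * Taylor coefficients 1/k! of exp, scaled by 2^63 and slightly
+ * adjusted, with the alternating signs folded into the subtraction).
+ * The final block multiplies by ccs in the same fixed-point format,
+ * so the return value is approximately ccs * exp(-x) * 2^63.
+ */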
+
+const fpr fpr_gm_tab[] = {
+ 0, 0,
+ 9223372036854775808U, 4607182418800017408U,
+ 4604544271217802189U, 4604544271217802189U,
+ 13827916308072577997U, 4604544271217802189U,
+ 4606496786581982534U, 4600565431771507043U,
+ 13823937468626282851U, 4606496786581982534U,
+ 4600565431771507043U, 4606496786581982534U,
+ 13829868823436758342U, 4600565431771507043U,
+ 4607009347991985328U, 4596196889902818827U,
+ 13819568926757594635U, 4607009347991985328U,
+ 4603179351334086856U, 4605664432017547683U,
+ 13829036468872323491U, 4603179351334086856U,
+ 4605664432017547683U, 4603179351334086856U,
+ 13826551388188862664U, 4605664432017547683U,
+ 4596196889902818827U, 4607009347991985328U,
+ 13830381384846761136U, 4596196889902818827U,
+ 4607139046673687846U, 4591727299969791020U,
+ 13815099336824566828U, 4607139046673687846U,
+ 4603889326261607894U, 4605137878724712257U,
+ 13828509915579488065U, 4603889326261607894U,
+ 4606118860100255153U, 4602163548591158843U,
+ 13825535585445934651U, 4606118860100255153U,
+ 4598900923775164166U, 4606794571824115162U,
+ 13830166608678890970U, 4598900923775164166U,
+ 4606794571824115162U, 4598900923775164166U,
+ 13822272960629939974U, 4606794571824115162U,
+ 4602163548591158843U, 4606118860100255153U,
+ 13829490896955030961U, 4602163548591158843U,
+ 4605137878724712257U, 4603889326261607894U,
+ 13827261363116383702U, 4605137878724712257U,
+ 4591727299969791020U, 4607139046673687846U,
+ 13830511083528463654U, 4591727299969791020U,
+ 4607171569234046334U, 4587232218149935124U,
+ 13810604255004710932U, 4607171569234046334U,
+ 4604224084862889120U, 4604849113969373103U,
+ 13828221150824148911U, 4604224084862889120U,
+ 4606317631232591731U, 4601373767755717824U,
+ 13824745804610493632U, 4606317631232591731U,
+ 4599740487990714333U, 4606655894547498725U,
+ 13830027931402274533U, 4599740487990714333U,
+ 4606912484326125783U, 4597922303871901467U,
+ 13821294340726677275U, 4606912484326125783U,
+ 4602805845399633902U, 4605900952042040894U,
+ 13829272988896816702U, 4602805845399633902U,
+ 4605409869824231233U, 4603540801876750389U,
+ 13826912838731526197U, 4605409869824231233U,
+ 4594454542771183930U, 4607084929468638487U,
+ 13830456966323414295U, 4594454542771183930U,
+ 4607084929468638487U, 4594454542771183930U,
+ 13817826579625959738U, 4607084929468638487U,
+ 4603540801876750389U, 4605409869824231233U,
+ 13828781906679007041U, 4603540801876750389U,
+ 4605900952042040894U, 4602805845399633902U,
+ 13826177882254409710U, 4605900952042040894U,
+ 4597922303871901467U, 4606912484326125783U,
+ 13830284521180901591U, 4597922303871901467U,
+ 4606655894547498725U, 4599740487990714333U,
+ 13823112524845490141U, 4606655894547498725U,
+ 4601373767755717824U, 4606317631232591731U,
+ 13829689668087367539U, 4601373767755717824U,
+ 4604849113969373103U, 4604224084862889120U,
+ 13827596121717664928U, 4604849113969373103U,
+ 4587232218149935124U, 4607171569234046334U,
+ 13830543606088822142U, 4587232218149935124U,
+ 4607179706000002317U, 4582730748936808062U,
+ 13806102785791583870U, 4607179706000002317U,
+ 4604386048625945823U, 4604698657331085206U,
+ 13828070694185861014U, 4604386048625945823U,
+ 4606409688975526202U, 4600971798440897930U,
+ 13824343835295673738U, 4606409688975526202U,
+ 4600154912527631775U, 4606578871587619388U,
+ 13829950908442395196U, 4600154912527631775U,
+ 4606963563043808649U, 4597061974398750563U,
+ 13820434011253526371U, 4606963563043808649U,
+ 4602994049708411683U, 4605784983948558848U,
+ 13829157020803334656U, 4602994049708411683U,
+ 4605539368864982914U, 4603361638657888991U,
+ 13826733675512664799U, 4605539368864982914U,
+ 4595327571478659014U, 4607049811591515049U,
+ 13830421848446290857U, 4595327571478659014U,
+ 4607114680469659603U, 4593485039402578702U,
+ 13816857076257354510U, 4607114680469659603U,
+ 4603716733069447353U, 4605276012900672507U,
+ 13828648049755448315U, 4603716733069447353U,
+ 4606012266443150634U, 4602550884377336506U,
+ 13825922921232112314U, 4606012266443150634U,
+ 4598476289818621559U, 4606856142606846307U,
+ 13830228179461622115U, 4598476289818621559U,
+ 4606727809065869586U, 4599322407794599425U,
+ 13822694444649375233U, 4606727809065869586U,
+ 4601771097584682078U, 4606220668805321205U,
+ 13829592705660097013U, 4601771097584682078U,
+ 4604995550503212910U, 4604058477489546729U,
+ 13827430514344322537U, 4604995550503212910U,
+ 4589965306122607094U, 4607158013403433018U,
+ 13830530050258208826U, 4589965306122607094U,
+ 4607158013403433018U, 4589965306122607094U,
+ 13813337342977382902U, 4607158013403433018U,
+ 4604058477489546729U, 4604995550503212910U,
+ 13828367587357988718U, 4604058477489546729U,
+ 4606220668805321205U, 4601771097584682078U,
+ 13825143134439457886U, 4606220668805321205U,
+ 4599322407794599425U, 4606727809065869586U,
+ 13830099845920645394U, 4599322407794599425U,
+ 4606856142606846307U, 4598476289818621559U,
+ 13821848326673397367U, 4606856142606846307U,
+ 4602550884377336506U, 4606012266443150634U,
+ 13829384303297926442U, 4602550884377336506U,
+ 4605276012900672507U, 4603716733069447353U,
+ 13827088769924223161U, 4605276012900672507U,
+ 4593485039402578702U, 4607114680469659603U,
+ 13830486717324435411U, 4593485039402578702U,
+ 4607049811591515049U, 4595327571478659014U,
+ 13818699608333434822U, 4607049811591515049U,
+ 4603361638657888991U, 4605539368864982914U,
+ 13828911405719758722U, 4603361638657888991U,
+ 4605784983948558848U, 4602994049708411683U,
+ 13826366086563187491U, 4605784983948558848U,
+ 4597061974398750563U, 4606963563043808649U,
+ 13830335599898584457U, 4597061974398750563U,
+ 4606578871587619388U, 4600154912527631775U,
+ 13823526949382407583U, 4606578871587619388U,
+ 4600971798440897930U, 4606409688975526202U,
+ 13829781725830302010U, 4600971798440897930U,
+ 4604698657331085206U, 4604386048625945823U,
+ 13827758085480721631U, 4604698657331085206U,
+ 4582730748936808062U, 4607179706000002317U,
+ 13830551742854778125U, 4582730748936808062U,
+ 4607181740574479067U, 4578227681973159812U,
+ 13801599718827935620U, 4607181740574479067U,
+ 4604465633578481725U, 4604621949701367983U,
+ 13827993986556143791U, 4604465633578481725U,
+ 4606453861145241227U, 4600769149537129431U,
+ 13824141186391905239U, 4606453861145241227U,
+ 4600360675823176935U, 4606538458821337243U,
+ 13829910495676113051U, 4600360675823176935U,
+ 4606987119037722413U, 4596629994023683153U,
+ 13820002030878458961U, 4606987119037722413U,
+ 4603087070374583113U, 4605725276488455441U,
+ 13829097313343231249U, 4603087070374583113U,
+ 4605602459698789090U, 4603270878689749849U,
+ 13826642915544525657U, 4605602459698789090U,
+ 4595762727260045105U, 4607030246558998647U,
+ 13830402283413774455U, 4595762727260045105U,
+ 4607127537664763515U, 4592606767730311893U,
+ 13815978804585087701U, 4607127537664763515U,
+ 4603803453461190356U, 4605207475328619533U,
+ 13828579512183395341U, 4603803453461190356U,
+ 4606066157444814153U, 4602357870542944470U,
+ 13825729907397720278U, 4606066157444814153U,
+ 4598688984595225406U, 4606826008603986804U,
+ 13830198045458762612U, 4598688984595225406U,
+ 4606761837001494797U, 4599112075441176914U,
+ 13822484112295952722U, 4606761837001494797U,
+ 4601967947786150793U, 4606170366472647579U,
+ 13829542403327423387U, 4601967947786150793U,
+ 4605067233569943231U, 4603974338538572089U,
+ 13827346375393347897U, 4605067233569943231U,
+ 4590846768565625881U, 4607149205763218185U,
+ 13830521242617993993U, 4590846768565625881U,
+ 4607165468267934125U, 4588998070480937184U,
+ 13812370107335712992U, 4607165468267934125U,
+ 4604141730443515286U, 4604922840319727473U,
+ 13828294877174503281U, 4604141730443515286U,
+ 4606269759522929756U, 4601573027631668967U,
+ 13824945064486444775U, 4606269759522929756U,
+ 4599531889160152938U, 4606692493141721470U,
+ 13830064529996497278U, 4599531889160152938U,
+ 4606884969294623682U, 4598262871476403630U,
+ 13821634908331179438U, 4606884969294623682U,
+ 4602710690099904183U, 4605957195211051218U,
+ 13829329232065827026U, 4602710690099904183U,
+ 4605343481119364930U, 4603629178146150899U,
+ 13827001215000926707U, 4605343481119364930U,
+ 4594016801320007031U, 4607100477024622401U,
+ 13830472513879398209U, 4594016801320007031U,
+ 4607068040143112603U, 4594891488091520602U,
+ 13818263524946296410U, 4607068040143112603U,
+ 4603451617570386922U, 4605475169017376660U,
+ 13828847205872152468U, 4603451617570386922U,
+ 4605843545406134034U, 4602900303344142735U,
+ 13826272340198918543U, 4605843545406134034U,
+ 4597492765973365521U, 4606938683557690074U,
+ 13830310720412465882U, 4597492765973365521U,
+ 4606618018794815019U, 4599948172872067014U,
+ 13823320209726842822U, 4606618018794815019U,
+ 4601173347964633034U, 4606364276725003740U,
+ 13829736313579779548U, 4601173347964633034U,
+ 4604774382555066977U, 4604305528345395596U,
+ 13827677565200171404U, 4604774382555066977U,
+ 4585465300892538317U, 4607176315382986589U,
+ 13830548352237762397U, 4585465300892538317U,
+ 4607176315382986589U, 4585465300892538317U,
+ 13808837337747314125U, 4607176315382986589U,
+ 4604305528345395596U, 4604774382555066977U,
+ 13828146419409842785U, 4604305528345395596U,
+ 4606364276725003740U, 4601173347964633034U,
+ 13824545384819408842U, 4606364276725003740U,
+ 4599948172872067014U, 4606618018794815019U,
+ 13829990055649590827U, 4599948172872067014U,
+ 4606938683557690074U, 4597492765973365521U,
+ 13820864802828141329U, 4606938683557690074U,
+ 4602900303344142735U, 4605843545406134034U,
+ 13829215582260909842U, 4602900303344142735U,
+ 4605475169017376660U, 4603451617570386922U,
+ 13826823654425162730U, 4605475169017376660U,
+ 4594891488091520602U, 4607068040143112603U,
+ 13830440076997888411U, 4594891488091520602U,
+ 4607100477024622401U, 4594016801320007031U,
+ 13817388838174782839U, 4607100477024622401U,
+ 4603629178146150899U, 4605343481119364930U,
+ 13828715517974140738U, 4603629178146150899U,
+ 4605957195211051218U, 4602710690099904183U,
+ 13826082726954679991U, 4605957195211051218U,
+ 4598262871476403630U, 4606884969294623682U,
+ 13830257006149399490U, 4598262871476403630U,
+ 4606692493141721470U, 4599531889160152938U,
+ 13822903926014928746U, 4606692493141721470U,
+ 4601573027631668967U, 4606269759522929756U,
+ 13829641796377705564U, 4601573027631668967U,
+ 4604922840319727473U, 4604141730443515286U,
+ 13827513767298291094U, 4604922840319727473U,
+ 4588998070480937184U, 4607165468267934125U,
+ 13830537505122709933U, 4588998070480937184U,
+ 4607149205763218185U, 4590846768565625881U,
+ 13814218805420401689U, 4607149205763218185U,
+ 4603974338538572089U, 4605067233569943231U,
+ 13828439270424719039U, 4603974338538572089U,
+ 4606170366472647579U, 4601967947786150793U,
+ 13825339984640926601U, 4606170366472647579U,
+ 4599112075441176914U, 4606761837001494797U,
+ 13830133873856270605U, 4599112075441176914U,
+ 4606826008603986804U, 4598688984595225406U,
+ 13822061021450001214U, 4606826008603986804U,
+ 4602357870542944470U, 4606066157444814153U,
+ 13829438194299589961U, 4602357870542944470U,
+ 4605207475328619533U, 4603803453461190356U,
+ 13827175490315966164U, 4605207475328619533U,
+ 4592606767730311893U, 4607127537664763515U,
+ 13830499574519539323U, 4592606767730311893U,
+ 4607030246558998647U, 4595762727260045105U,
+ 13819134764114820913U, 4607030246558998647U,
+ 4603270878689749849U, 4605602459698789090U,
+ 13828974496553564898U, 4603270878689749849U,
+ 4605725276488455441U, 4603087070374583113U,
+ 13826459107229358921U, 4605725276488455441U,
+ 4596629994023683153U, 4606987119037722413U,
+ 13830359155892498221U, 4596629994023683153U,
+ 4606538458821337243U, 4600360675823176935U,
+ 13823732712677952743U, 4606538458821337243U,
+ 4600769149537129431U, 4606453861145241227U,
+ 13829825898000017035U, 4600769149537129431U,
+ 4604621949701367983U, 4604465633578481725U,
+ 13827837670433257533U, 4604621949701367983U,
+ 4578227681973159812U, 4607181740574479067U,
+ 13830553777429254875U, 4578227681973159812U,
+ 4607182249242036882U, 4573724215515480177U,
+ 13797096252370255985U, 4607182249242036882U,
+ 4604505071555817232U, 4604583231088591477U,
+ 13827955267943367285U, 4604505071555817232U,
+ 4606475480113671417U, 4600667422348321968U,
+ 13824039459203097776U, 4606475480113671417U,
+ 4600463181646572228U, 4606517779747998088U,
+ 13829889816602773896U, 4600463181646572228U,
+ 4606998399608725124U, 4596413578358834022U,
+ 13819785615213609830U, 4606998399608725124U,
+ 4603133304188877240U, 4605694995810664660U,
+ 13829067032665440468U, 4603133304188877240U,
+ 4605633586259814045U, 4603225210076562971U,
+ 13826597246931338779U, 4605633586259814045U,
+ 4595979936813835462U, 4607019963775302583U,
+ 13830392000630078391U, 4595979936813835462U,
+ 4607133460805585796U, 4592167175087283203U,
+ 13815539211942059011U, 4607133460805585796U,
+ 4603846496621587377U, 4605172808754305228U,
+ 13828544845609081036U, 4603846496621587377U,
+ 4606092657816072624U, 4602260871257280788U,
+ 13825632908112056596U, 4606092657816072624U,
+ 4598795050632330097U, 4606810452769876110U,
+ 13830182489624651918U, 4598795050632330097U,
+ 4606778366364612594U, 4599006600037663623U,
+ 13822378636892439431U, 4606778366364612594U,
+ 4602065906208722008U, 4606144763310860551U,
+ 13829516800165636359U, 4602065906208722008U,
+ 4605102686554936490U, 4603931940768740167U,
+ 13827303977623515975U, 4605102686554936490U,
+ 4591287158938884897U, 4607144295058764886U,
+ 13830516331913540694U, 4591287158938884897U,
+ 4607168688050493276U, 4588115294056142819U,
+ 13811487330910918627U, 4607168688050493276U,
+ 4604183020748362039U, 4604886103475043762U,
+ 13828258140329819570U, 4604183020748362039U,
+ 4606293848208650998U, 4601473544562720001U,
+ 13824845581417495809U, 4606293848208650998U,
+ 4599636300858866724U, 4606674353838411301U,
+ 13830046390693187109U, 4599636300858866724U,
+ 4606898891031025132U, 4598136582470364665U,
+ 13821508619325140473U, 4606898891031025132U,
+ 4602758354025980442U, 4605929219593405673U,
+ 13829301256448181481U, 4602758354025980442U,
+ 4605376811039722786U, 4603585091850767959U,
+ 13826957128705543767U, 4605376811039722786U,
+ 4594235767444503503U, 4607092871118901179U,
+ 13830464907973676987U, 4594235767444503503U,
+ 4607076652372832968U, 4594673119063280916U,
+ 13818045155918056724U, 4607076652372832968U,
+ 4603496309891590679U, 4605442656228245717U,
+ 13828814693083021525U, 4603496309891590679U,
+ 4605872393621214213U, 4602853162432841185U,
+ 13826225199287616993U, 4605872393621214213U,
+ 4597707695679609371U, 4606925748668145757U,
+ 13830297785522921565U, 4597707695679609371U,
+ 4606637115963965612U, 4599844446633109139U,
+ 13823216483487884947U, 4606637115963965612U,
+ 4601273700967202825U, 4606341107699334546U,
+ 13829713144554110354U, 4601273700967202825U,
+ 4604811873195349477U, 4604264921241055824U,
+ 13827636958095831632U, 4604811873195349477U,
+ 4586348876009622851U, 4607174111710118367U,
+ 13830546148564894175U, 4586348876009622851U,
+ 4607178180169683960U, 4584498631466405633U,
+ 13807870668321181441U, 4607178180169683960U,
+ 4604345904647073908U, 4604736643460027021U,
+ 13828108680314802829U, 4604345904647073908U,
+ 4606387137437298591U, 4601072712526242277U,
+ 13824444749381018085U, 4606387137437298591U,
+ 4600051662802353687U, 4606598603759044570U,
+ 13829970640613820378U, 4600051662802353687U,
+ 4606951288507767453U, 4597277522845151878U,
+ 13820649559699927686U, 4606951288507767453U,
+ 4602947266358709886U, 4605814408482919348U,
+ 13829186445337695156U, 4602947266358709886U,
+ 4605507406967535927U, 4603406726595779752U,
+ 13826778763450555560U, 4605507406967535927U,
+ 4595109641634432498U, 4607059093103722971U,
+ 13830431129958498779U, 4595109641634432498U,
+ 4607107746899444102U, 4593797652641645341U,
+ 13817169689496421149U, 4607107746899444102U,
+ 4603673059103075106U, 4605309881318010327U,
+ 13828681918172786135U, 4603673059103075106U,
+ 4605984877841711338U, 4602646891659203088U,
+ 13826018928513978896U, 4605984877841711338U,
+ 4598369669086960528U, 4606870719641066940U,
+ 13830242756495842748U, 4598369669086960528U,
+ 4606710311774494716U, 4599427256825614420U,
+ 13822799293680390228U, 4606710311774494716U,
+ 4601672213217083403U, 4606245366082353408U,
+ 13829617402937129216U, 4601672213217083403U,
+ 4604959323120302796U, 4604100215502905499U,
+ 13827472252357681307U, 4604959323120302796U,
+ 4589524267239410099U, 4607161910007591876U,
+ 13830533946862367684U, 4589524267239410099U,
+ 4607153778602162496U, 4590406145430462614U,
+ 13813778182285238422U, 4607153778602162496U,
+ 4604016517974851588U, 4605031521104517324U,
+ 13828403557959293132U, 4604016517974851588U,
+ 4606195668621671667U, 4601869677011524443U,
+ 13825241713866300251U, 4606195668621671667U,
+ 4599217346014614711U, 4606744984357082948U,
+ 13830117021211858756U, 4599217346014614711U,
+ 4606841238740778884U, 4598582729657176439U,
+ 13821954766511952247U, 4606841238740778884U,
+ 4602454542796181607U, 4606039359984203741U,
+ 13829411396838979549U, 4602454542796181607U,
+ 4605241877142478242U, 4603760198400967492U,
+ 13827132235255743300U, 4605241877142478242U,
+ 4593046061348462537U, 4607121277474223905U,
+ 13830493314328999713U, 4593046061348462537U,
+ 4607040195955932526U, 4595545269419264690U,
+ 13818917306274040498U, 4607040195955932526U,
+ 4603316355454250015U, 4605571053506370248U,
+ 13828943090361146056U, 4603316355454250015U,
+ 4605755272910869620U, 4603040651631881451U,
+ 13826412688486657259U, 4605755272910869620U,
+ 4596846128749438754U, 4606975506703684317U,
+ 13830347543558460125U, 4596846128749438754U,
+ 4606558823023444576U, 4600257918160607478U,
+ 13823629955015383286U, 4606558823023444576U,
+ 4600870609507958271U, 4606431930490633905U,
+ 13829803967345409713U, 4600870609507958271U,
+ 4604660425598397818U, 4604425958770613225U,
+ 13827797995625389033U, 4604660425598397818U,
+ 4580962600092897021U, 4607180892816495009U,
+ 13830552929671270817U, 4580962600092897021U,
+ 4607180892816495009U, 4580962600092897021U,
+ 13804334636947672829U, 4607180892816495009U,
+ 4604425958770613225U, 4604660425598397818U,
+ 13828032462453173626U, 4604425958770613225U,
+ 4606431930490633905U, 4600870609507958271U,
+ 13824242646362734079U, 4606431930490633905U,
+ 4600257918160607478U, 4606558823023444576U,
+ 13829930859878220384U, 4600257918160607478U,
+ 4606975506703684317U, 4596846128749438754U,
+ 13820218165604214562U, 4606975506703684317U,
+ 4603040651631881451U, 4605755272910869620U,
+ 13829127309765645428U, 4603040651631881451U,
+ 4605571053506370248U, 4603316355454250015U,
+ 13826688392309025823U, 4605571053506370248U,
+ 4595545269419264690U, 4607040195955932526U,
+ 13830412232810708334U, 4595545269419264690U,
+ 4607121277474223905U, 4593046061348462537U,
+ 13816418098203238345U, 4607121277474223905U,
+ 4603760198400967492U, 4605241877142478242U,
+ 13828613913997254050U, 4603760198400967492U,
+ 4606039359984203741U, 4602454542796181607U,
+ 13825826579650957415U, 4606039359984203741U,
+ 4598582729657176439U, 4606841238740778884U,
+ 13830213275595554692U, 4598582729657176439U,
+ 4606744984357082948U, 4599217346014614711U,
+ 13822589382869390519U, 4606744984357082948U,
+ 4601869677011524443U, 4606195668621671667U,
+ 13829567705476447475U, 4601869677011524443U,
+ 4605031521104517324U, 4604016517974851588U,
+ 13827388554829627396U, 4605031521104517324U,
+ 4590406145430462614U, 4607153778602162496U,
+ 13830525815456938304U, 4590406145430462614U,
+ 4607161910007591876U, 4589524267239410099U,
+ 13812896304094185907U, 4607161910007591876U,
+ 4604100215502905499U, 4604959323120302796U,
+ 13828331359975078604U, 4604100215502905499U,
+ 4606245366082353408U, 4601672213217083403U,
+ 13825044250071859211U, 4606245366082353408U,
+ 4599427256825614420U, 4606710311774494716U,
+ 13830082348629270524U, 4599427256825614420U,
+ 4606870719641066940U, 4598369669086960528U,
+ 13821741705941736336U, 4606870719641066940U,
+ 4602646891659203088U, 4605984877841711338U,
+ 13829356914696487146U, 4602646891659203088U,
+ 4605309881318010327U, 4603673059103075106U,
+ 13827045095957850914U, 4605309881318010327U,
+ 4593797652641645341U, 4607107746899444102U,
+ 13830479783754219910U, 4593797652641645341U,
+ 4607059093103722971U, 4595109641634432498U,
+ 13818481678489208306U, 4607059093103722971U,
+ 4603406726595779752U, 4605507406967535927U,
+ 13828879443822311735U, 4603406726595779752U,
+ 4605814408482919348U, 4602947266358709886U,
+ 13826319303213485694U, 4605814408482919348U,
+ 4597277522845151878U, 4606951288507767453U,
+ 13830323325362543261U, 4597277522845151878U,
+ 4606598603759044570U, 4600051662802353687U,
+ 13823423699657129495U, 4606598603759044570U,
+ 4601072712526242277U, 4606387137437298591U,
+ 13829759174292074399U, 4601072712526242277U,
+ 4604736643460027021U, 4604345904647073908U,
+ 13827717941501849716U, 4604736643460027021U,
+ 4584498631466405633U, 4607178180169683960U,
+ 13830550217024459768U, 4584498631466405633U,
+ 4607174111710118367U, 4586348876009622851U,
+ 13809720912864398659U, 4607174111710118367U,
+ 4604264921241055824U, 4604811873195349477U,
+ 13828183910050125285U, 4604264921241055824U,
+ 4606341107699334546U, 4601273700967202825U,
+ 13824645737821978633U, 4606341107699334546U,
+ 4599844446633109139U, 4606637115963965612U,
+ 13830009152818741420U, 4599844446633109139U,
+ 4606925748668145757U, 4597707695679609371U,
+ 13821079732534385179U, 4606925748668145757U,
+ 4602853162432841185U, 4605872393621214213U,
+ 13829244430475990021U, 4602853162432841185U,
+ 4605442656228245717U, 4603496309891590679U,
+ 13826868346746366487U, 4605442656228245717U,
+ 4594673119063280916U, 4607076652372832968U,
+ 13830448689227608776U, 4594673119063280916U,
+ 4607092871118901179U, 4594235767444503503U,
+ 13817607804299279311U, 4607092871118901179U,
+ 4603585091850767959U, 4605376811039722786U,
+ 13828748847894498594U, 4603585091850767959U,
+ 4605929219593405673U, 4602758354025980442U,
+ 13826130390880756250U, 4605929219593405673U,
+ 4598136582470364665U, 4606898891031025132U,
+ 13830270927885800940U, 4598136582470364665U,
+ 4606674353838411301U, 4599636300858866724U,
+ 13823008337713642532U, 4606674353838411301U,
+ 4601473544562720001U, 4606293848208650998U,
+ 13829665885063426806U, 4601473544562720001U,
+ 4604886103475043762U, 4604183020748362039U,
+ 13827555057603137847U, 4604886103475043762U,
+ 4588115294056142819U, 4607168688050493276U,
+ 13830540724905269084U, 4588115294056142819U,
+ 4607144295058764886U, 4591287158938884897U,
+ 13814659195793660705U, 4607144295058764886U,
+ 4603931940768740167U, 4605102686554936490U,
+ 13828474723409712298U, 4603931940768740167U,
+ 4606144763310860551U, 4602065906208722008U,
+ 13825437943063497816U, 4606144763310860551U,
+ 4599006600037663623U, 4606778366364612594U,
+ 13830150403219388402U, 4599006600037663623U,
+ 4606810452769876110U, 4598795050632330097U,
+ 13822167087487105905U, 4606810452769876110U,
+ 4602260871257280788U, 4606092657816072624U,
+ 13829464694670848432U, 4602260871257280788U,
+ 4605172808754305228U, 4603846496621587377U,
+ 13827218533476363185U, 4605172808754305228U,
+ 4592167175087283203U, 4607133460805585796U,
+ 13830505497660361604U, 4592167175087283203U,
+ 4607019963775302583U, 4595979936813835462U,
+ 13819351973668611270U, 4607019963775302583U,
+ 4603225210076562971U, 4605633586259814045U,
+ 13829005623114589853U, 4603225210076562971U,
+ 4605694995810664660U, 4603133304188877240U,
+ 13826505341043653048U, 4605694995810664660U,
+ 4596413578358834022U, 4606998399608725124U,
+ 13830370436463500932U, 4596413578358834022U,
+ 4606517779747998088U, 4600463181646572228U,
+ 13823835218501348036U, 4606517779747998088U,
+ 4600667422348321968U, 4606475480113671417U,
+ 13829847516968447225U, 4600667422348321968U,
+ 4604583231088591477U, 4604505071555817232U,
+ 13827877108410593040U, 4604583231088591477U,
+ 4573724215515480177U, 4607182249242036882U,
+ 13830554286096812690U, 4573724215515480177U,
+ 4607182376410422530U, 4569220649180767418U,
+ 13792592686035543226U, 4607182376410422530U,
+ 4604524701268679793U, 4604563781218984604U,
+ 13827935818073760412U, 4604524701268679793U,
+ 4606486172460753999U, 4600616459743653188U,
+ 13823988496598428996U, 4606486172460753999U,
+ 4600514338912178239U, 4606507322377452870U,
+ 13829879359232228678U, 4600514338912178239U,
+ 4607003915349878877U, 4596305267720071930U,
+ 13819677304574847738U, 4607003915349878877U,
+ 4603156351203636159U, 4605679749231851918U,
+ 13829051786086627726U, 4603156351203636159U,
+ 4605649044311923410U, 4603202304363743346U,
+ 13826574341218519154U, 4605649044311923410U,
+ 4596088445927168004U, 4607014697483910382U,
+ 13830386734338686190U, 4596088445927168004U,
+ 4607136295912168606U, 4591947271803021404U,
+ 13815319308657797212U, 4607136295912168606U,
+ 4603867938232615808U, 4605155376589456981U,
+ 13828527413444232789U, 4603867938232615808U,
+ 4606105796280968177U, 4602212250118051877U,
+ 13825584286972827685U, 4606105796280968177U,
+ 4598848011564831930U, 4606802552898869248U,
+ 13830174589753645056U, 4598848011564831930U,
+ 4606786509620734768U, 4598953786765296928U,
+ 13822325823620072736U, 4606786509620734768U,
+ 4602114767134999006U, 4606131849150971908U,
+ 13829503886005747716U, 4602114767134999006U,
+ 4605120315324767624U, 4603910660507251362U,
+ 13827282697362027170U, 4605120315324767624U,
+ 4591507261658050721U, 4607141713064252300U,
+ 13830513749919028108U, 4591507261658050721U,
+ 4607170170974224083U, 4587673791460508439U,
+ 13811045828315284247U, 4607170170974224083U,
+ 4604203581176243359U, 4604867640218014515U,
+ 13828239677072790323U, 4604203581176243359U,
+ 4606305777984577632U, 4601423692641949331U,
+ 13824795729496725139U, 4606305777984577632U,
+ 4599688422741010356U, 4606665164148251002U,
+ 13830037201003026810U, 4599688422741010356U,
+ 4606905728766014348U, 4598029484874872834U,
+ 13821401521729648642U, 4606905728766014348U,
+ 4602782121393764535U, 4605915122243179241U,
+ 13829287159097955049U, 4602782121393764535U,
+ 4605393374401988274U, 4603562972219549215U,
+ 13826935009074325023U, 4605393374401988274U,
+ 4594345179472540681U, 4607088942243446236U,
+ 13830460979098222044U, 4594345179472540681U,
+ 4607080832832247697U, 4594563856311064231U,
+ 13817935893165840039U, 4607080832832247697U,
+ 4603518581031047189U, 4605426297151190466U,
+ 13828798334005966274U, 4603518581031047189U,
+ 4605886709123365959U, 4602829525820289164U,
+ 13826201562675064972U, 4605886709123365959U,
+ 4597815040470278984U, 4606919157647773535U,
+ 13830291194502549343U, 4597815040470278984U,
+ 4606646545123403481U, 4599792496117920694U,
+ 13823164532972696502U, 4606646545123403481U,
+ 4601323770373937522U, 4606329407841126011U,
+ 13829701444695901819U, 4601323770373937522U,
+ 4604830524903495634U, 4604244531615310815U,
+ 13827616568470086623U, 4604830524903495634U,
+ 4586790578280679046U, 4607172882816799076U,
+ 13830544919671574884U, 4586790578280679046U,
+ 4607178985458280057U, 4583614727651146525U,
+ 13806986764505922333U, 4607178985458280057U,
+ 4604366005771528720U, 4604717681185626434U,
+ 13828089718040402242U, 4604366005771528720U,
+ 4606398451906509788U, 4601022290077223616U,
+ 13824394326931999424U, 4606398451906509788U,
+ 4600103317933788342U, 4606588777269136769U,
+ 13829960814123912577U, 4600103317933788342U,
+ 4606957467106717424U, 4597169786279785693U,
+ 13820541823134561501U, 4606957467106717424U,
+ 4602970680601913687U, 4605799732098147061U,
+ 13829171768952922869U, 4602970680601913687U,
+ 4605523422498301790U, 4603384207141321914U,
+ 13826756243996097722U, 4605523422498301790U,
+ 4595218635031890910U, 4607054494135176056U,
+ 13830426530989951864U, 4595218635031890910U,
+ 4607111255739239816U, 4593688012422887515U,
+ 13817060049277663323U, 4607111255739239816U,
+ 4603694922063032361U, 4605292980606880364U,
+ 13828665017461656172U, 4603694922063032361U,
+ 4605998608960791335U, 4602598930031891166U,
+ 13825970966886666974U, 4605998608960791335U,
+ 4598423001813699022U, 4606863472012527185U,
+ 13830235508867302993U, 4598423001813699022U,
+ 4606719100629313491U, 4599374859150636784U,
+ 13822746896005412592U, 4606719100629313491U,
+ 4601721693286060937U, 4606233055365547081U,
+ 13829605092220322889U, 4601721693286060937U,
+ 4604977468824438271U, 4604079374282302598U,
+ 13827451411137078406U, 4604977468824438271U,
+ 4589744810590291021U, 4607160003989618959U,
+ 13830532040844394767U, 4589744810590291021U,
+ 4607155938267770208U, 4590185751760970393U,
+ 13813557788615746201U, 4607155938267770208U,
+ 4604037525321326463U, 4605013567986435066U,
+ 13828385604841210874U, 4604037525321326463U,
+ 4606208206518262803U, 4601820425647934753U,
+ 13825192462502710561U, 4606208206518262803U,
+ 4599269903251194481U, 4606736437002195879U,
+ 13830108473856971687U, 4599269903251194481U,
+ 4606848731493011465U, 4598529532600161144U,
+ 13821901569454936952U, 4606848731493011465U,
+ 4602502755147763107U, 4606025850160239809U,
+ 13829397887015015617U, 4602502755147763107U,
+ 4605258978359093269U, 4603738491917026584U,
+ 13827110528771802392U, 4605258978359093269U,
+ 4593265590854265407U, 4607118021058468598U,
+ 13830490057913244406U, 4593265590854265407U,
+ 4607045045516813836U, 4595436449949385485U,
+ 13818808486804161293U, 4607045045516813836U,
+ 4603339021357904144U, 4605555245917486022U,
+ 13828927282772261830U, 4603339021357904144U,
+ 4605770164172969910U, 4603017373458244943U,
+ 13826389410313020751U, 4605770164172969910U,
+ 4596954088216812973U, 4606969576261663845U,
+ 13830341613116439653U, 4596954088216812973U,
+ 4606568886807728474U, 4600206446098256018U,
+ 13823578482953031826U, 4606568886807728474U,
+ 4600921238092511730U, 4606420848538580260U,
+ 13829792885393356068U, 4600921238092511730U,
+ 4604679572075463103U, 4604406033021674239U,
+ 13827778069876450047U, 4604679572075463103U,
+ 4581846703643734566U, 4607180341788068727U,
+ 13830552378642844535U, 4581846703643734566U,
+ 4607181359080094673U, 4579996072175835083U,
+ 13803368109030610891U, 4607181359080094673U,
+ 4604445825685214043U, 4604641218080103285U,
+ 13828013254934879093U, 4604445825685214043U,
+ 4606442934727379583U, 4600819913163773071U,
+ 13824191950018548879U, 4606442934727379583U,
+ 4600309328230211502U, 4606548680329491866U,
+ 13829920717184267674U, 4600309328230211502U,
+ 4606981354314050484U, 4596738097012783531U,
+ 13820110133867559339U, 4606981354314050484U,
+ 4603063884010218172U, 4605740310302420207U,
+ 13829112347157196015U, 4603063884010218172U,
+ 4605586791482848547U, 4603293641160266722U,
+ 13826665678015042530U, 4605586791482848547U,
+ 4595654028864046335U, 4607035262954517034U,
+ 13830407299809292842U, 4595654028864046335U,
+ 4607124449686274900U, 4592826452951465409U,
+ 13816198489806241217U, 4607124449686274900U,
+ 4603781852316960384U, 4605224709411790590U,
+ 13828596746266566398U, 4603781852316960384U,
+ 4606052795787882823U, 4602406247776385022U,
+ 13825778284631160830U, 4606052795787882823U,
+ 4598635880488956483U, 4606833664420673202U,
+ 13830205701275449010U, 4598635880488956483U,
+ 4606753451050079834U, 4599164736579548843U,
+ 13822536773434324651U, 4606753451050079834U,
+ 4601918851211878557U, 4606183055233559255U,
+ 13829555092088335063U, 4601918851211878557U,
+ 4605049409688478101U, 4603995455647851249U,
+ 13827367492502627057U, 4605049409688478101U,
+ 4590626485056654602U, 4607151534426937478U,
+ 13830523571281713286U, 4590626485056654602U,
+ 4607163731439411601U, 4589303678145802340U,
+ 13812675715000578148U, 4607163731439411601U,
+ 4604121000955189926U, 4604941113561600762U,
+ 13828313150416376570U, 4604121000955189926U,
+ 4606257600839867033U, 4601622657843474729U,
+ 13824994694698250537U, 4606257600839867033U,
+ 4599479600326345459U, 4606701442584137310U,
+ 13830073479438913118U, 4599479600326345459U,
+ 4606877885424248132U, 4598316292140394014U,
+ 13821688328995169822U, 4606877885424248132U,
+ 4602686793990243041U, 4605971073215153165U,
+ 13829343110069928973U, 4602686793990243041U,
+ 4605326714874986465U, 4603651144395358093U,
+ 13827023181250133901U, 4605326714874986465U,
+ 4593907249284540294U, 4607104153983298999U,
+ 13830476190838074807U, 4593907249284540294U,
+ 4607063608453868552U, 4595000592312171144U,
+ 13818372629166946952U, 4607063608453868552U,
+ 4603429196809300824U, 4605491322423429598U,
+ 13828863359278205406U, 4603429196809300824U,
+ 4605829012964735987U, 4602923807199184054U,
+ 13826295844053959862U, 4605829012964735987U,
+ 4597385183080791534U, 4606945027305114062U,
+ 13830317064159889870U, 4597385183080791534U,
+ 4606608350964852124U, 4599999947619525579U,
+ 13823371984474301387U, 4606608350964852124U,
+ 4601123065313358619U, 4606375745674388705U,
+ 13829747782529164513U, 4601123065313358619U,
+ 4604755543975806820U, 4604325745441780828U,
+ 13827697782296556636U, 4604755543975806820U,
+ 4585023436363055487U, 4607177290141793710U,
+ 13830549326996569518U, 4585023436363055487U,
+ 4607175255902437396U, 4585907115494236537U,
+ 13809279152349012345U, 4607175255902437396U,
+ 4604285253548209224U, 4604793159020491611U,
+ 13828165195875267419U, 4604285253548209224U,
+ 4606352730697093817U, 4601223560006786057U,
+ 13824595596861561865U, 4606352730697093817U,
+ 4599896339047301634U, 4606627607157935956U,
+ 13829999644012711764U, 4599896339047301634U,
+ 4606932257325205256U, 4597600270510262682U,
+ 13820972307365038490U, 4606932257325205256U,
+ 4602876755014813164U, 4605858005670328613U,
+ 13829230042525104421U, 4602876755014813164U,
+ 4605458946901419122U, 4603473988668005304U,
+ 13826846025522781112U, 4605458946901419122U,
+ 4594782329999411347U, 4607072388129742377U,
+ 13830444424984518185U, 4594782329999411347U,
+ 4607096716058023245U, 4594126307716900071U,
+ 13817498344571675879U, 4607096716058023245U,
+ 4603607160562208225U, 4605360179893335444U,
+ 13828732216748111252U, 4603607160562208225U,
+ 4605943243960030558U, 4602734543519989142U,
+ 13826106580374764950U, 4605943243960030558U,
+ 4598209407597805010U, 4606891971185517504U,
+ 13830264008040293312U, 4598209407597805010U,
+ 4606683463531482757U, 4599584122834874440U,
+ 13822956159689650248U, 4606683463531482757U,
+ 4601523323048804569U, 4606281842017099424U,
+ 13829653878871875232U, 4601523323048804569U,
+ 4604904503566677638U, 4604162403772767740U,
+ 13827534440627543548U, 4604904503566677638U,
+ 4588556721781247689U, 4607167120476811757U,
+ 13830539157331587565U, 4588556721781247689U,
+ 4607146792632922887U, 4591066993883984169U,
+ 13814439030738759977U, 4607146792632922887U,
+ 4603953166845776383U, 4605084992581147553U,
+ 13828457029435923361U, 4603953166845776383U,
+ 4606157602458368090U, 4602016966272225497U,
+ 13825389003127001305U, 4606157602458368090U,
+ 4599059363095165615U, 4606770142132396069U,
+ 13830142178987171877U, 4599059363095165615U,
+ 4606818271362779153U, 4598742041476147134U,
+ 13822114078330922942U, 4606818271362779153U,
+ 4602309411551204896U, 4606079444829232727U,
+ 13829451481684008535U, 4602309411551204896U,
+ 4605190175055178825U, 4603825001630339212U,
+ 13827197038485115020U, 4605190175055178825U,
+ 4592387007752762956U, 4607130541380624519U,
+ 13830502578235400327U, 4592387007752762956U,
+ 4607025146816593591U, 4595871363584150300U,
+ 13819243400438926108U, 4607025146816593591U,
+ 4603248068256948438U, 4605618058006716661U,
+ 13828990094861492469U, 4603248068256948438U,
+ 4605710171610479304U, 4603110210506737381U,
+ 13826482247361513189U, 4605710171610479304U,
+ 4596521820799644122U, 4606992800820440327U,
+ 13830364837675216135U, 4596521820799644122U,
+ 4606528158595189433U, 4600411960456200676U,
+ 13823783997310976484U, 4606528158595189433U,
+ 4600718319105833937U, 4606464709641375231U,
+ 13829836746496151039U, 4600718319105833937U,
+ 4604602620643553229U, 4604485382263976838U,
+ 13827857419118752646U, 4604602620643553229U,
+ 4576459225186735875U, 4607182037296057423U,
+ 13830554074150833231U, 4576459225186735875U,
+ 4607182037296057423U, 4576459225186735875U,
+ 13799831262041511683U, 4607182037296057423U,
+ 4604485382263976838U, 4604602620643553229U,
+ 13827974657498329037U, 4604485382263976838U,
+ 4606464709641375231U, 4600718319105833937U,
+ 13824090355960609745U, 4606464709641375231U,
+ 4600411960456200676U, 4606528158595189433U,
+ 13829900195449965241U, 4600411960456200676U,
+ 4606992800820440327U, 4596521820799644122U,
+ 13819893857654419930U, 4606992800820440327U,
+ 4603110210506737381U, 4605710171610479304U,
+ 13829082208465255112U, 4603110210506737381U,
+ 4605618058006716661U, 4603248068256948438U,
+ 13826620105111724246U, 4605618058006716661U,
+ 4595871363584150300U, 4607025146816593591U,
+ 13830397183671369399U, 4595871363584150300U,
+ 4607130541380624519U, 4592387007752762956U,
+ 13815759044607538764U, 4607130541380624519U,
+ 4603825001630339212U, 4605190175055178825U,
+ 13828562211909954633U, 4603825001630339212U,
+ 4606079444829232727U, 4602309411551204896U,
+ 13825681448405980704U, 4606079444829232727U,
+ 4598742041476147134U, 4606818271362779153U,
+ 13830190308217554961U, 4598742041476147134U,
+ 4606770142132396069U, 4599059363095165615U,
+ 13822431399949941423U, 4606770142132396069U,
+ 4602016966272225497U, 4606157602458368090U,
+ 13829529639313143898U, 4602016966272225497U,
+ 4605084992581147553U, 4603953166845776383U,
+ 13827325203700552191U, 4605084992581147553U,
+ 4591066993883984169U, 4607146792632922887U,
+ 13830518829487698695U, 4591066993883984169U,
+ 4607167120476811757U, 4588556721781247689U,
+ 13811928758636023497U, 4607167120476811757U,
+ 4604162403772767740U, 4604904503566677638U,
+ 13828276540421453446U, 4604162403772767740U,
+ 4606281842017099424U, 4601523323048804569U,
+ 13824895359903580377U, 4606281842017099424U,
+ 4599584122834874440U, 4606683463531482757U,
+ 13830055500386258565U, 4599584122834874440U,
+ 4606891971185517504U, 4598209407597805010U,
+ 13821581444452580818U, 4606891971185517504U,
+ 4602734543519989142U, 4605943243960030558U,
+ 13829315280814806366U, 4602734543519989142U,
+ 4605360179893335444U, 4603607160562208225U,
+ 13826979197416984033U, 4605360179893335444U,
+ 4594126307716900071U, 4607096716058023245U,
+ 13830468752912799053U, 4594126307716900071U,
+ 4607072388129742377U, 4594782329999411347U,
+ 13818154366854187155U, 4607072388129742377U,
+ 4603473988668005304U, 4605458946901419122U,
+ 13828830983756194930U, 4603473988668005304U,
+ 4605858005670328613U, 4602876755014813164U,
+ 13826248791869588972U, 4605858005670328613U,
+ 4597600270510262682U, 4606932257325205256U,
+ 13830304294179981064U, 4597600270510262682U,
+ 4606627607157935956U, 4599896339047301634U,
+ 13823268375902077442U, 4606627607157935956U,
+ 4601223560006786057U, 4606352730697093817U,
+ 13829724767551869625U, 4601223560006786057U,
+ 4604793159020491611U, 4604285253548209224U,
+ 13827657290402985032U, 4604793159020491611U,
+ 4585907115494236537U, 4607175255902437396U,
+ 13830547292757213204U, 4585907115494236537U,
+ 4607177290141793710U, 4585023436363055487U,
+ 13808395473217831295U, 4607177290141793710U,
+ 4604325745441780828U, 4604755543975806820U,
+ 13828127580830582628U, 4604325745441780828U,
+ 4606375745674388705U, 4601123065313358619U,
+ 13824495102168134427U, 4606375745674388705U,
+ 4599999947619525579U, 4606608350964852124U,
+ 13829980387819627932U, 4599999947619525579U,
+ 4606945027305114062U, 4597385183080791534U,
+ 13820757219935567342U, 4606945027305114062U,
+ 4602923807199184054U, 4605829012964735987U,
+ 13829201049819511795U, 4602923807199184054U,
+ 4605491322423429598U, 4603429196809300824U,
+ 13826801233664076632U, 4605491322423429598U,
+ 4595000592312171144U, 4607063608453868552U,
+ 13830435645308644360U, 4595000592312171144U,
+ 4607104153983298999U, 4593907249284540294U,
+ 13817279286139316102U, 4607104153983298999U,
+ 4603651144395358093U, 4605326714874986465U,
+ 13828698751729762273U, 4603651144395358093U,
+ 4605971073215153165U, 4602686793990243041U,
+ 13826058830845018849U, 4605971073215153165U,
+ 4598316292140394014U, 4606877885424248132U,
+ 13830249922279023940U, 4598316292140394014U,
+ 4606701442584137310U, 4599479600326345459U,
+ 13822851637181121267U, 4606701442584137310U,
+ 4601622657843474729U, 4606257600839867033U,
+ 13829629637694642841U, 4601622657843474729U,
+ 4604941113561600762U, 4604121000955189926U,
+ 13827493037809965734U, 4604941113561600762U,
+ 4589303678145802340U, 4607163731439411601U,
+ 13830535768294187409U, 4589303678145802340U,
+ 4607151534426937478U, 4590626485056654602U,
+ 13813998521911430410U, 4607151534426937478U,
+ 4603995455647851249U, 4605049409688478101U,
+ 13828421446543253909U, 4603995455647851249U,
+ 4606183055233559255U, 4601918851211878557U,
+ 13825290888066654365U, 4606183055233559255U,
+ 4599164736579548843U, 4606753451050079834U,
+ 13830125487904855642U, 4599164736579548843U,
+ 4606833664420673202U, 4598635880488956483U,
+ 13822007917343732291U, 4606833664420673202U,
+ 4602406247776385022U, 4606052795787882823U,
+ 13829424832642658631U, 4602406247776385022U,
+ 4605224709411790590U, 4603781852316960384U,
+ 13827153889171736192U, 4605224709411790590U,
+ 4592826452951465409U, 4607124449686274900U,
+ 13830496486541050708U, 4592826452951465409U,
+ 4607035262954517034U, 4595654028864046335U,
+ 13819026065718822143U, 4607035262954517034U,
+ 4603293641160266722U, 4605586791482848547U,
+ 13828958828337624355U, 4603293641160266722U,
+ 4605740310302420207U, 4603063884010218172U,
+ 13826435920864993980U, 4605740310302420207U,
+ 4596738097012783531U, 4606981354314050484U,
+ 13830353391168826292U, 4596738097012783531U,
+ 4606548680329491866U, 4600309328230211502U,
+ 13823681365084987310U, 4606548680329491866U,
+ 4600819913163773071U, 4606442934727379583U,
+ 13829814971582155391U, 4600819913163773071U,
+ 4604641218080103285U, 4604445825685214043U,
+ 13827817862539989851U, 4604641218080103285U,
+ 4579996072175835083U, 4607181359080094673U,
+ 13830553395934870481U, 4579996072175835083U,
+ 4607180341788068727U, 4581846703643734566U,
+ 13805218740498510374U, 4607180341788068727U,
+ 4604406033021674239U, 4604679572075463103U,
+ 13828051608930238911U, 4604406033021674239U,
+ 4606420848538580260U, 4600921238092511730U,
+ 13824293274947287538U, 4606420848538580260U,
+ 4600206446098256018U, 4606568886807728474U,
+ 13829940923662504282U, 4600206446098256018U,
+ 4606969576261663845U, 4596954088216812973U,
+ 13820326125071588781U, 4606969576261663845U,
+ 4603017373458244943U, 4605770164172969910U,
+ 13829142201027745718U, 4603017373458244943U,
+ 4605555245917486022U, 4603339021357904144U,
+ 13826711058212679952U, 4605555245917486022U,
+ 4595436449949385485U, 4607045045516813836U,
+ 13830417082371589644U, 4595436449949385485U,
+ 4607118021058468598U, 4593265590854265407U,
+ 13816637627709041215U, 4607118021058468598U,
+ 4603738491917026584U, 4605258978359093269U,
+ 13828631015213869077U, 4603738491917026584U,
+ 4606025850160239809U, 4602502755147763107U,
+ 13825874792002538915U, 4606025850160239809U,
+ 4598529532600161144U, 4606848731493011465U,
+ 13830220768347787273U, 4598529532600161144U,
+ 4606736437002195879U, 4599269903251194481U,
+ 13822641940105970289U, 4606736437002195879U,
+ 4601820425647934753U, 4606208206518262803U,
+ 13829580243373038611U, 4601820425647934753U,
+ 4605013567986435066U, 4604037525321326463U,
+ 13827409562176102271U, 4605013567986435066U,
+ 4590185751760970393U, 4607155938267770208U,
+ 13830527975122546016U, 4590185751760970393U,
+ 4607160003989618959U, 4589744810590291021U,
+ 13813116847445066829U, 4607160003989618959U,
+ 4604079374282302598U, 4604977468824438271U,
+ 13828349505679214079U, 4604079374282302598U,
+ 4606233055365547081U, 4601721693286060937U,
+ 13825093730140836745U, 4606233055365547081U,
+ 4599374859150636784U, 4606719100629313491U,
+ 13830091137484089299U, 4599374859150636784U,
+ 4606863472012527185U, 4598423001813699022U,
+ 13821795038668474830U, 4606863472012527185U,
+ 4602598930031891166U, 4605998608960791335U,
+ 13829370645815567143U, 4602598930031891166U,
+ 4605292980606880364U, 4603694922063032361U,
+ 13827066958917808169U, 4605292980606880364U,
+ 4593688012422887515U, 4607111255739239816U,
+ 13830483292594015624U, 4593688012422887515U,
+ 4607054494135176056U, 4595218635031890910U,
+ 13818590671886666718U, 4607054494135176056U,
+ 4603384207141321914U, 4605523422498301790U,
+ 13828895459353077598U, 4603384207141321914U,
+ 4605799732098147061U, 4602970680601913687U,
+ 13826342717456689495U, 4605799732098147061U,
+ 4597169786279785693U, 4606957467106717424U,
+ 13830329503961493232U, 4597169786279785693U,
+ 4606588777269136769U, 4600103317933788342U,
+ 13823475354788564150U, 4606588777269136769U,
+ 4601022290077223616U, 4606398451906509788U,
+ 13829770488761285596U, 4601022290077223616U,
+ 4604717681185626434U, 4604366005771528720U,
+ 13827738042626304528U, 4604717681185626434U,
+ 4583614727651146525U, 4607178985458280057U,
+ 13830551022313055865U, 4583614727651146525U,
+ 4607172882816799076U, 4586790578280679046U,
+ 13810162615135454854U, 4607172882816799076U,
+ 4604244531615310815U, 4604830524903495634U,
+ 13828202561758271442U, 4604244531615310815U,
+ 4606329407841126011U, 4601323770373937522U,
+ 13824695807228713330U, 4606329407841126011U,
+ 4599792496117920694U, 4606646545123403481U,
+ 13830018581978179289U, 4599792496117920694U,
+ 4606919157647773535U, 4597815040470278984U,
+ 13821187077325054792U, 4606919157647773535U,
+ 4602829525820289164U, 4605886709123365959U,
+ 13829258745978141767U, 4602829525820289164U,
+ 4605426297151190466U, 4603518581031047189U,
+ 13826890617885822997U, 4605426297151190466U,
+ 4594563856311064231U, 4607080832832247697U,
+ 13830452869687023505U, 4594563856311064231U,
+ 4607088942243446236U, 4594345179472540681U,
+ 13817717216327316489U, 4607088942243446236U,
+ 4603562972219549215U, 4605393374401988274U,
+ 13828765411256764082U, 4603562972219549215U,
+ 4605915122243179241U, 4602782121393764535U,
+ 13826154158248540343U, 4605915122243179241U,
+ 4598029484874872834U, 4606905728766014348U,
+ 13830277765620790156U, 4598029484874872834U,
+ 4606665164148251002U, 4599688422741010356U,
+ 13823060459595786164U, 4606665164148251002U,
+ 4601423692641949331U, 4606305777984577632U,
+ 13829677814839353440U, 4601423692641949331U,
+ 4604867640218014515U, 4604203581176243359U,
+ 13827575618031019167U, 4604867640218014515U,
+ 4587673791460508439U, 4607170170974224083U,
+ 13830542207828999891U, 4587673791460508439U,
+ 4607141713064252300U, 4591507261658050721U,
+ 13814879298512826529U, 4607141713064252300U,
+ 4603910660507251362U, 4605120315324767624U,
+ 13828492352179543432U, 4603910660507251362U,
+ 4606131849150971908U, 4602114767134999006U,
+ 13825486803989774814U, 4606131849150971908U,
+ 4598953786765296928U, 4606786509620734768U,
+ 13830158546475510576U, 4598953786765296928U,
+ 4606802552898869248U, 4598848011564831930U,
+ 13822220048419607738U, 4606802552898869248U,
+ 4602212250118051877U, 4606105796280968177U,
+ 13829477833135743985U, 4602212250118051877U,
+ 4605155376589456981U, 4603867938232615808U,
+ 13827239975087391616U, 4605155376589456981U,
+ 4591947271803021404U, 4607136295912168606U,
+ 13830508332766944414U, 4591947271803021404U,
+ 4607014697483910382U, 4596088445927168004U,
+ 13819460482781943812U, 4607014697483910382U,
+ 4603202304363743346U, 4605649044311923410U,
+ 13829021081166699218U, 4603202304363743346U,
+ 4605679749231851918U, 4603156351203636159U,
+ 13826528388058411967U, 4605679749231851918U,
+ 4596305267720071930U, 4607003915349878877U,
+ 13830375952204654685U, 4596305267720071930U,
+ 4606507322377452870U, 4600514338912178239U,
+ 13823886375766954047U, 4606507322377452870U,
+ 4600616459743653188U, 4606486172460753999U,
+ 13829858209315529807U, 4600616459743653188U,
+ 4604563781218984604U, 4604524701268679793U,
+ 13827896738123455601U, 4604563781218984604U,
+ 4569220649180767418U, 4607182376410422530U,
+ 13830554413265198338U, 4569220649180767418U
+};
+
+const fpr fpr_p2_tab[] = {
+ 4611686018427387904U,
+ 4607182418800017408U,
+ 4602678819172646912U,
+ 4598175219545276416U,
+ 4593671619917905920U,
+ 4589168020290535424U,
+ 4584664420663164928U,
+ 4580160821035794432U,
+ 4575657221408423936U,
+ 4571153621781053440U,
+ 4566650022153682944U
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.h b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.h
new file mode 100644
index 000000000..beab1ab66
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.h
@@ -0,0 +1,491 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+/*
+ * Custom floating-point implementation with integer arithmetics. We
+ * use IEEE-754 "binary64" format, with some simplifications:
+ *
+ * - Top bit is s = 1 for negative, 0 for positive.
+ *
+ * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
+ *
+ * - Mantissa m uses the 52 low bits.
+ *
+ * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
+ * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
+ * less than 1.0), but the top bit (equal to 1 by definition) is omitted
+ * in the encoding.
+ *
+ * In IEEE-754, there are some special values:
+ *
+ * - If e = 2047, then the value is either an infinity (m = 0) or
+ * a NaN (m != 0).
+ *
+ * - If e = 0, then the value is either a zero (m = 0) or a subnormal,
+ * aka "denormalized number" (m != 0).
+ *
+ * Of these, we only need the zeros. The caller is responsible for not
+ * providing operands that would lead to infinites, NaNs or subnormals.
+ * If inputs are such that values go out of range, then indeterminate
+ * values are returned (it would still be deterministic, but no specific
+ * value may be relied upon).
+ *
+ * At the C level, the three parts are stored in a 64-bit unsigned
+ * word.
+ *
+ * One may note that a property of the IEEE-754 format is that order
+ * is preserved for positive values: if two positive floating-point
+ * values x and y are such that x < y, then their respective encodings
+ * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
+ * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
+ * y < 0, and x < y, then i64(x) > i64(y).
+ *
+ * IMPORTANT ASSUMPTIONS:
+ * ======================
+ *
+ * For proper computations, and constant-time behaviour, we assume the
+ * following:
+ *
+ * - 32x32->64 multiplication (unsigned) has an execution time that
+ * is independent of its operands. This is true of most modern
+ * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
+ * and M3 (in the M0 and M0+, this is done in software, so it depends
+ * on that routine), and the PowerPC cores from the G3/G4 lines.
+ * For more info, see: https://www.bearssl.org/ctmul.html
+ *
+ * - Left-shifts and right-shifts of 32-bit values have an execution
+ * time which does not depend on the shifted value nor on the
+ * shift count. An historical exception is the Pentium IV, but most
+ * modern CPU have barrel shifters. Some small microcontrollers
+ * might have varying-time shifts (not the ARM Cortex M*, though).
+ *
+ * - Right-shift of a signed negative value performs a sign extension.
+ * As per the C standard, this operation returns an
+ * implementation-defined result (this is NOT an "undefined
+ * behaviour"). On most/all systems, an arithmetic shift is
+ * performed, because this is what makes most sense.
+ */
+
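To make the layout above concrete, here is a small standalone sketch (not part of the patch) that splits a 64-bit pattern into the three fields just described. The pattern 4607182418800017408 is the fpr_one constant declared further down in this header; it decodes to sign 0, biased exponent 1023 and mantissa 0, i.e. (-1)^0 * 2^(1023-1023) * 1.0 = 1.0.

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t x = 4607182418800017408U;            /* fpr_one, i.e. 1.0 */
    unsigned s = (unsigned)(x >> 63);             /* sign bit          */
    unsigned e = (unsigned)(x >> 52) & 0x7FF;     /* 11 exponent bits  */
    uint64_t m = x & (((uint64_t)1 << 52) - 1);   /* 52 mantissa bits  */

    printf("s=%u biased_e=%u unbiased_e=%d m=%llu\n",
           s, e, (int)e - 1023, (unsigned long long)m);
    return 0;
}
```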
+/*
+ * Normally we should declare the 'fpr' type to be a struct or union
+ * around the internal 64-bit value; however, we want to use the
+ * direct 64-bit integer type to enable a lighter call convention on
+ * ARM platforms. This means that direct (invalid) use of operators
+ * such as '*' or '+' will not be caught by the compiler. We rely on
+ * the "normal" (non-emulated) code to detect such instances.
+ */
+typedef uint64_t fpr;
+
+/*
+ * For computations, we split values into an integral mantissa in the
+ * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
+ * "sticky" (it is set to 1 if any of the bits below it is 1); when
+ * re-encoding, the low two bits are dropped, but may induce an
+ * increment in the value for proper rounding.
+ */
+
+/*
+ * Right-shift a 64-bit unsigned value by a possibly secret shift count.
+ * We assume that the underlying architecture has a barrel shifter for
+ * 32-bit shifts, but a 64-bit shift on a 32-bit system will
+ * typically invoke a software routine that is not necessarily
+ * constant-time; hence the function below.
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ursh(uint64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Right-shift a 64-bit signed value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline int64_t
+fpr_irsh(int64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Left-shift a 64-bit unsigned value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ulsh(uint64_t x, int n) {
+ x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
+ return x << (n & 31);
+}
+
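A minimal self-check of the construction used by the three shift helpers above (not part of the patch): the 32-bit conditional swap selected by bit 5 of the count, followed by a shift of 0..31 bits, must agree with a plain 64-bit shift for every count in 0..63.

```c
#include <assert.h>
#include <stdint.h>

/* Same two-step construction as fpr_ursh() above, copied here so the
 * check compiles on its own. */
static uint64_t ursh_ct(uint64_t x, int n) {
    x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
    return x >> (n & 31);
}

int main(void) {
    uint64_t x = 0x0123456789ABCDEFU;
    int n;

    for (n = 0; n < 64; n++) {
        assert(ursh_ct(x, n) == (x >> n));
    }
    return 0;
}
```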
+/*
+ * Expectations:
+ * s = 0 or 1
+ * exponent e is "arbitrary" and unbiased
+ * 2^54 <= m < 2^55
+ * Numerical value is (-1)^s * m * 2^e
+ *
+ * Exponents which are too low lead to value zero. If the exponent is
+ * too large, the returned value is indeterminate.
+ *
+ * If m = 0, then a zero is returned (using the provided sign).
+ * If e < -1076, then a zero is returned (regardless of the value of m).
+ * If e >= -1076 and e != 0, m must be within the expected range
+ * (2^54 to 2^55-1).
+ */
+static inline fpr
+FPR(int s, int e, uint64_t m) {
+ fpr x;
+ uint32_t t;
+ unsigned f;
+
+ /*
+ * If e >= -1076, then the value is "normal"; otherwise, it
+ * should be a subnormal, which we clamp down to zero.
+ */
+ e += 1076;
+ t = (uint32_t)e >> 31;
+ m &= (uint64_t)t - 1;
+
+ /*
+ * If m = 0 then we want a zero; make e = 0 too, but conserve
+ * the sign.
+ */
+ t = (uint32_t)(m >> 54);
+ e &= -(int)t;
+
+ /*
+ * The 52 mantissa bits come from m. Value m has its top bit set
+ * (unless it is a zero); we leave it "as is": the top bit will
+ * increment the exponent by 1, except when m = 0, which is
+ * exactly what we want.
+ */
+ x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
+
+ /*
+ * Rounding: if the low three bits of m are 011, 110 or 111,
+ * then the value should be incremented to get the next
+ * representable value. This implements the usual
+ * round-to-nearest rule (with preference to even values in case
+ * of a tie). Note that the increment may make a carry spill
+ * into the exponent field, which is again exactly what we want
+ * in that case.
+ */
+ f = (unsigned)m & 7U;
+ x += (0xC8U >> f) & 1;
+ return x;
+}
+
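A hedged spot-check of FPR(), assuming the definitions in this header plus <assert.h> are in scope (not part of the patch): with m * 2^e as the value and m in the 2^54..2^55-1 range, 1.0 corresponds to m = 2^54 and e = -54, and the result should be the bit pattern 4607182418800017408, i.e. the fpr_one constant declared below. The low three bits of m are 000 here, so the (0xC8U >> f) & 1 rounding term contributes nothing.

```c
#include <assert.h>

static void check_fpr_of_one(void) {
    /* 1.0 = 2^54 * 2^(-54); m sits at the bottom of the 2^54..2^55-1 range. */
    fpr one = FPR(0, -54, (uint64_t)1 << 54);
    assert(one == 4607182418800017408U); /* == fpr_one, declared below */
}
```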
+#define fpr_scaled PQCLEAN_FALCONPADDED512_CLEAN_fpr_scaled
+fpr fpr_scaled(int64_t i, int sc);
+
+static inline fpr
+fpr_of(int64_t i) {
+ return fpr_scaled(i, 0);
+}
+
+static const fpr fpr_q = 4667981563525332992;
+static const fpr fpr_inverse_of_q = 4545632735260551042;
+static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
+static const fpr fpr_inv_sigma[] = {
+ 0, /* unused */
+ 4574611497772390042,
+ 4574501679055810265,
+ 4574396282908341804,
+ 4574245855758572086,
+ 4574103865040221165,
+ 4573969550563515544,
+ 4573842244705920822,
+ 4573721358406441454,
+ 4573606369665796042,
+ 4573496814039276259
+};
+static const fpr fpr_sigma_min[] = {
+ 0, /* unused */
+ 4607707126469777035,
+ 4607777455861499430,
+ 4607846828256951418,
+ 4607949175006100261,
+ 4608049571757433526,
+ 4608148125896792003,
+ 4608244935301382692,
+ 4608340089478362016,
+ 4608433670533905013,
+ 4608525754002622308
+};
+static const fpr fpr_log2 = 4604418534313441775;
+static const fpr fpr_inv_log2 = 4609176140021203710;
+static const fpr fpr_bnorm_max = 4670353323383631276;
+static const fpr fpr_zero = 0;
+static const fpr fpr_one = 4607182418800017408;
+static const fpr fpr_two = 4611686018427387904;
+static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
+static const fpr fpr_ptwo31 = 4746794007248502784;
+static const fpr fpr_ptwo31m1 = 4746794007244308480;
+static const fpr fpr_mtwo31m1 = 13970166044099084288U;
+static const fpr fpr_ptwo63m1 = 4890909195324358656;
+static const fpr fpr_mtwo63m1 = 14114281232179134464U;
+static const fpr fpr_ptwo63 = 4890909195324358656;
+
+static inline int64_t
+fpr_rint(fpr x) {
+ uint64_t m, d;
+ int e;
+ uint32_t s, dd, f;
+
+ /*
+ * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
+ * thus extract the mantissa as a 63-bit integer, then right-shift
+ * it as needed.
+ */
+ m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ e = 1085 - ((int)(x >> 52) & 0x7FF);
+
+ /*
+ * If a shift of more than 63 bits is needed, then simply set m
+ * to zero. This also covers the case of an input operand equal
+ * to zero.
+ */
+ m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
+ e &= 63;
+
+ /*
+ * Right-shift m as needed. Shift count is e. Proper rounding
+ * mandates that:
+ * - If the highest dropped bit is zero, then round low.
+ * - If the highest dropped bit is one, and at least one of the
+ * other dropped bits is one, then round up.
+ * - If the highest dropped bit is one, and all other dropped
+ * bits are zero, then round up if the lowest kept bit is 1,
+ * or low otherwise (i.e. ties are broken by "rounding to even").
+ *
+ * We thus first extract a word consisting of all the dropped bit
+ * AND the lowest kept bit; then we shrink it down to three bits,
+ * the lowest being "sticky".
+ */
+ d = fpr_ulsh(m, 63 - e);
+ dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
+ f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
+ m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
+
+ /*
+ * Apply the sign bit.
+ */
+ s = (uint32_t)(x >> 63);
+ return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
+}
+
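Two hedged examples of the rounding rule implemented above, assuming this header and <assert.h> are in scope (not part of the patch): 1.0 converts to the integer 1, while the tie 0.5 rounds to the even value 0.

```c
#include <assert.h>

static void check_fpr_rint(void) {
    assert(fpr_rint(fpr_one) == 1);     /* rint(1.0) == 1               */
    assert(fpr_rint(fpr_onehalf) == 0); /* rint(0.5) == 0: ties to even */
}
```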
+static inline int64_t
+fpr_floor(fpr x) {
+ uint64_t t;
+ int64_t xi;
+ int e, cc;
+
+ /*
+ * We extract the integer as a _signed_ 64-bit integer with
+ * a scaling factor. Since we assume that the value fits
+ * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
+ * absolute value to make it in the 2^62..2^63-1 range: we
+ * will only need a right-shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ t = x >> 63;
+ xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
+ & (((uint64_t)1 << 63) - 1));
+ xi = (xi ^ -(int64_t)t) + (int64_t)t;
+ cc = 1085 - e;
+
+ /*
+ * We perform an arithmetic right-shift on the value. This
+ * applies floor() semantics on both positive and negative values
+ * (rounding toward minus infinity).
+ */
+ xi = fpr_irsh(xi, cc & 63);
+
+ /*
+ * If the true shift count was 64 or more, then we should instead
+ * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
+ * case: -0 will be floored to -1, not 0 (whether this is correct
+ * is debatable; in any case, the other functions normalize zero
+ * to +0).
+ *
+ * For an input of zero, the non-shifted xi was incorrect (we used
+ * a top implicit bit of value 1, not 0), but this does not matter
+ * since this operation will clamp it down.
+ */
+ xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
+ return xi;
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ uint64_t t, xu;
+ int e, cc;
+
+ /*
+ * Extract the absolute value. Since we assume that the value
+ * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
+ * the absolute value into the 2^62..2^63-1 range, and then
+ * do a right shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ cc = 1085 - e;
+ xu = fpr_ursh(xu, cc & 63);
+
+ /*
+ * If the exponent is too low (cc > 63), then the shift was wrong
+ * and we must clamp the value to 0. This also covers the case
+ * of an input equal to zero.
+ */
+ xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
+
+ /*
+ * Apply back the sign, if the source value is negative.
+ */
+ t = x >> 63;
+ xu = (xu ^ -t) + t;
+ return *(int64_t *)&xu;
+}
+
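Hedged examples contrasting fpr_floor() and fpr_trunc(), assuming this header and <assert.h> are in scope (not part of the patch): flooring rounds toward minus infinity while truncation rounds toward zero, so the two differ on -0.5. The explicit sign flip below is exactly what fpr_neg(), defined further down, performs.

```c
#include <assert.h>

static void check_floor_trunc(void) {
    fpr minus_half = fpr_onehalf ^ ((uint64_t)1 << 63); /* -0.5 */

    assert(fpr_floor(minus_half) == -1); /* toward minus infinity */
    assert(fpr_trunc(minus_half) == 0);  /* toward zero           */
}
```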
+#define fpr_add PQCLEAN_FALCONPADDED512_CLEAN_fpr_add
+fpr fpr_add(fpr x, fpr y);
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ y ^= (uint64_t)1 << 63;
+ return fpr_add(x, y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ x ^= (uint64_t)1 << 63;
+ return x;
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ /*
+ * To divide a value by 2, we just have to subtract 1 from its
+ * exponent, but we have to take care of zero.
+ */
+ uint32_t t;
+
+ x -= (uint64_t)1 << 52;
+ t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
+ x &= (uint64_t)t - 1;
+ return x;
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ /*
+ * To double a value, we just increment by one the exponent. We
+ * don't care about infinities or NaNs; however, 0 is a
+ * special case.
+ */
+ x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
+ return x;
+}
+
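Hedged examples of the exponent-only halving and doubling above, using the constants declared earlier in this header and assuming <assert.h> is in scope (not part of the patch); zero is the special case both functions must preserve.

```c
#include <assert.h>

static void check_half_double(void) {
    assert(fpr_half(fpr_one) == fpr_onehalf);  /* exponent decremented */
    assert(fpr_double(fpr_one) == fpr_two);    /* exponent incremented */
    assert(fpr_half(fpr_zero) == fpr_zero);    /* zero stays zero      */
    assert(fpr_double(fpr_zero) == fpr_zero);  /* zero stays zero      */
}
```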
+#define fpr_mul PQCLEAN_FALCONPADDED512_CLEAN_fpr_mul
+fpr fpr_mul(fpr x, fpr y);
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return fpr_mul(x, x);
+}
+
+#define fpr_div PQCLEAN_FALCONPADDED512_CLEAN_fpr_div
+fpr fpr_div(fpr x, fpr y);
+
+static inline fpr
+fpr_inv(fpr x) {
+ return fpr_div(4607182418800017408u, x);
+}
+
+#define fpr_sqrt PQCLEAN_FALCONPADDED512_CLEAN_fpr_sqrt
+fpr fpr_sqrt(fpr x);
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ /*
+ * If both x and y are positive, then a signed comparison yields
+ * the proper result:
+ * - For positive values, the order is preserved.
+ * - The sign bit is at the same place as in integers, so
+ * sign is preserved.
+ * Moreover, we can compute [x < y] as sgn(x-y) and the computation
+ * of x-y will not overflow.
+ *
+ * If the signs differ, then sgn(x) gives the proper result.
+ *
+ * If both x and y are negative, then the order is reversed.
+ * Hence [x < y] = sgn(y-x). We must compute this separately from
+ * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
+ * case x = y properly.
+ */
+ int cc0, cc1;
+ int64_t sx;
+ int64_t sy;
+
+ sx = *(int64_t *)&x;
+ sy = *(int64_t *)&y;
+ sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
+
+ cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
+ cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
+
+ return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
+}
+
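Hedged examples covering the three cases discussed above (both operands positive, differing signs, both negative), assuming this header and <assert.h> are in scope (not part of the patch).

```c
#include <assert.h>

static void check_fpr_lt(void) {
    fpr mone  = fpr_neg(fpr_one);     /* -1.0 */
    fpr mhalf = fpr_neg(fpr_onehalf); /* -0.5 */

    assert(fpr_lt(fpr_onehalf, fpr_one) == 1); /* both positive          */
    assert(fpr_lt(mone, fpr_onehalf) == 1);    /* signs differ           */
    assert(fpr_lt(mone, mhalf) == 1);          /* both negative: reversed
                                                  integer order handled  */
    assert(fpr_lt(fpr_one, fpr_one) == 0);     /* x < x is false         */
}
```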
+/*
+ * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
+ * bits or so.
+ */
+#define fpr_expm_p63 PQCLEAN_FALCONPADDED512_CLEAN_fpr_expm_p63
+uint64_t fpr_expm_p63(fpr x, fpr ccs);
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED512_CLEAN_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED512_CLEAN_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/inner.h b/src/sig/falcon/pqclean_falcon-padded-512_clean/inner.h
new file mode 100644
index 000000000..361f06263
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/inner.h
@@ -0,0 +1,820 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED512_CLEAN_ macro (e.g. PQCLEAN_FALCONPADDED512_CLEAN_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
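A hedged usage sketch of the macros above (not part of the patch), assuming this inner.h and therefore PQClean's fips202.h are in scope: the inject/flip/extract sequence mirrors what the hash_to_point functions further down expect, with the context flipped before anything is derived from it. The 32-byte output length is arbitrary here.

```c
#include <stddef.h>
#include <stdint.h>

static void shake_nonce_and_message(uint8_t out[32],
                                    const uint8_t *nonce, size_t nonce_len,
                                    const uint8_t *msg, size_t msg_len) {
    inner_shake256_context sc;

    inner_shake256_init(&sc);
    inner_shake256_inject(&sc, nonce, nonce_len);
    inner_shake256_inject(&sc, msg, msg_len);
    inner_shake256_flip(&sc);
    inner_shake256_extract(&sc, out, 32);
    inner_shake256_ctx_release(&sc);
}
```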
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED512_CLEAN_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED512_CLEAN_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+
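+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): the two-pass pattern described above. A
+ * first call with a NULL output buffer returns the exact encoded
+ * length; the second call performs the encoding proper and, on
+ * success, returns that same length. example_encode_small_poly is a
+ * hypothetical helper name.
+ */
+static inline size_t
+example_encode_small_poly(uint8_t *out, size_t max_out_len,
+                          const int8_t *f, unsigned logn, unsigned bits) {
+    size_t need;
+
+    need = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(NULL, 0, f, logn, bits);
+    if (need == 0 || need > max_out_len) {
+        return 0;   /* out-of-range coefficient, or buffer too small */
+    }
+    return PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(out, max_out_len,
+            f, logn, bits);
+}
+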
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. The private key consists of
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ * return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
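+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): the abstract fpr API in action. This
+ * computes the squared l2-norm of a small integer vector and compares
+ * it against a bound, without assuming that 'fpr' is the native
+ * 'double' type. example_fpr_norm_below is a hypothetical name.
+ */
+static inline int
+example_fpr_norm_below(const int16_t *v, size_t n, fpr bound) {
+    fpr s;
+    size_t u;
+
+    s = fpr_zero;
+    for (u = 0; u < n; u ++) {
+        s = fpr_add(s, fpr_sqr(fpr_of(v[u])));
+    }
+    return fpr_lt(s, bound);   /* 1 if the squared norm is below the bound */
+}
+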
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. The PRNG is seeded from the provided SHAKE256
+ * context (which must be in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are fewer than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+ }
+ return v;
+}
+
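+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): seeding and using the PRNG declared above.
+ * A SHAKE256 context is seeded (here from the system RNG), flipped,
+ * and used to instantiate the PRNG; bulk bytes and 64-bit words can
+ * then be drawn from it. example_prng_usage is a hypothetical name.
+ */
+static inline int
+example_prng_usage(uint8_t *dst, size_t len) {
+    inner_shake256_context sc;
+    prng p;
+    uint8_t seed[48];
+
+    if (!PQCLEAN_FALCONPADDED512_CLEAN_get_seed(seed, sizeof seed)) {
+        return 0;
+    }
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, seed, sizeof seed);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&p, &sc);
+    inner_shake256_ctx_release(&sc);
+    PQCLEAN_FALCONPADDED512_CLEAN_prng_get_bytes(&p, dst, len);
+    (void)prng_get_u64(&p);   /* one extra 64-bit draw, for illustration */
+    return 1;
+}
+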
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
+
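+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): multiplying two real polynomials modulo
+ * X^N+1 through the FFT. Both operands (N = 2^logn 'fpr' values each,
+ * non-overlapping) are transformed, multiplied pointwise in FFT
+ * representation, and the product is brought back with the inverse
+ * FFT; b is clobbered. example_poly_mul is a hypothetical name.
+ */
+static inline void
+example_poly_mul(fpr *a, fpr *b, unsigned logn) {
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(a, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(b, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(a, b, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_iFFT(a, logn);   /* a <- a*b mod X^N+1 */
+}
+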
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
+
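+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): split followed by merge reconstructs the
+ * original polynomial (up to floating-point rounding). f holds
+ * N = 2^logn 'fpr' values in FFT representation; t0 and t1 are
+ * scratch arrays of N/2 values each, distinct from f.
+ */
+static inline void
+example_split_merge(fpr *f, fpr *t0, fpr *t1, unsigned logn) {
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(t0, t1, f, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(f, t0, t1, logn);
+}
+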
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
+
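+/*
+ * Illustrative check (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): for logn >= 3 the macros above are exactly
+ * 28*2^logn bytes, as stated in the comment. Requires C11
+ * _Static_assert.
+ */
+_Static_assert(FALCON_KEYGEN_TEMP_9 == 28u * 512u, "28*2^9 bytes");
+_Static_assert(FALCON_KEYGEN_TEMP_10 == 28u * 1024u, "28*2^10 bytes");
+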
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
+
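+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): generating a Falcon-512 key pair (logn = 9)
+ * with the function declared above. The temporary buffer is placed in
+ * a union to obtain 64-bit alignment; the SHAKE256 context is seeded
+ * from a caller-provided seed and flipped before use.
+ * example_keygen_512 is a hypothetical name.
+ */
+static inline void
+example_keygen_512(const uint8_t *seed, size_t seed_len,
+                   int8_t f[512], int8_t g[512], int8_t F[512],
+                   int8_t G[512], uint16_t h[512]) {
+    union {
+        uint8_t b[FALCON_KEYGEN_TEMP_9];
+        uint64_t align;
+    } tmp;
+    inner_shake256_context rng;
+
+    inner_shake256_init(&rng);
+    inner_shake256_inject(&rng, seed, seed_len);
+    inner_shake256_flip(&rng);
+    PQCLEAN_FALCONPADDED512_CLEAN_keygen(&rng, f, g, F, G, h, 9, tmp.b);
+    inner_shake256_ctx_release(&rng);
+}
+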
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED512_CLEAN_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED512_CLEAN_gaussian0_sampler(prng *p);
+
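+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): setting up a sampler_context and drawing a
+ * single value. The centre and standard deviation used here (mu = 0,
+ * sigma = 2, i.e. isigma = 0.5) are arbitrary illustration values, not
+ * the ones used by the signing code; sigma_min is passed in by the
+ * caller. example_sampler_draw is a hypothetical name.
+ */
+static inline int
+example_sampler_draw(inner_shake256_context *seeded_rng, fpr sigma_min) {
+    sampler_context sc;
+
+    PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&sc.p, seeded_rng);
+    sc.sigma_min = sigma_min;
+    return PQCLEAN_FALCONPADDED512_CLEAN_sampler(&sc,
+            fpr_of(0), fpr_half(fpr_one));
+}
+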
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/keygen.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/keygen.c
new file mode 100644
index 000000000..f556877cc
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/keygen.c
@@ -0,0 +1,4234 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
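+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon sources): converting into and out of Montgomery
+ * representation with the helpers above. Multiplying x by
+ * R2 = 2^62 mod p yields x*R mod p (the Montgomery form of x);
+ * multiplying a Montgomery value by 1 removes the R factor again.
+ * example_monty_roundtrip is a hypothetical name.
+ */
+static inline uint32_t
+example_monty_roundtrip(uint32_t x, uint32_t p) {
+    uint32_t p0i, R2, xm;
+
+    p0i = modp_ninv31(p);
+    R2 = modp_R2(p, p0i);
+    xm = modp_montymul(x, R2, p, p0i);    /* xm = x*R mod p */
+    return modp_montymul(xm, 1, p, p0i);  /* back to x */
+}
+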
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
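+/*
+ * (Illustrative note, added for clarity; not part of the upstream
+ * comment.) REV10[] above is the 10-bit bit-reversal permutation:
+ * rev(1) = 0b1000000000 = 512 and rev(2) = 0b0100000000 = 256, so
+ * gm[512] holds g^1 and gm[256] holds g^2 (in Montgomery form). For
+ * logn < 10, the lookup REV10[u << (10 - logn)] below yields the
+ * logn-bit bit-reversal of u, since the low index bits are zero.
+ */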
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
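+/*
+ * (Illustrative note, added for clarity.) Each pass of the loop below
+ * combines pairs (x, y) into (x + s*y, x - s*y) mod p, where s is a
+ * power of the 2N-th root of unity read from gm[] in bit-reversed
+ * order; the inverse transform further down undoes these butterflies
+ * and finally multiplies every coefficient by 1/n.
+ */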
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
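+
+/*
+ * (Illustrative note, added for clarity.) With 31-bit limbs in
+ * little-endian order, the value 2^31 + 5 is stored as the two words
+ * { 5, 1 }, and -1 over two words (two's complement over 62 bits) is
+ * { 0x7FFFFFFF, 0x7FFFFFFF }.
+ */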
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m with a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
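+/*
+ * (Illustrative note, added for clarity.) The loop below is an
+ * incremental CRT (Garner's method). Small example: from x = 2 mod 7
+ * and x = 3 mod 11, with q = 7, p = 11 and s = 1/q mod p = 8, the
+ * update computes 2 + 7*((8*(3-2)) mod 11) = 58, the unique solution
+ * modulo 77.
+ */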
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have their 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remaining of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step reduces the total length
+ * (sum of lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiples of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, so a division by 2 is simply
+ * achieved by not multiplying by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
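+/*
+ * (Illustrative note, added for clarity: for sc = 100 the caller would
+ * pass sch = 3 and scl = 7, since 100 = 3*31 + 7.)
+ */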
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to a higher-order word.
+ */
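+/*
+ * (Illustrative note, added for clarity.) Callers dealing with values
+ * longer than 10 words pass 'f + true_len - 10' with flen = 10, so only
+ * the top 310 bits contribute; the dropped low words amount to a
+ * scaling by 2^(31*(true_len - 10)), which the callers track separately
+ * (see scale_fg and scale_FG in solve_NTRU_intermediate()).
+ */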
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
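+/*
+ * (Illustrative note, added for clarity.) The product is negacyclic
+ * (multiplication modulo X^N+1): in the inner loop below, once the
+ * running output index u+v reaches n-1, the destination pointer wraps
+ * back to F[0] and the sign of the coefficient is flipped, which is
+ * exactly the X^N = -1 reduction.
+ */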
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
+ */
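+/*
+ * (Illustrative note, added for clarity.) For logn = 9 (N = 512), g = 2
+ * below, so two samples from the N = 1024 table are added; their sum
+ * has standard deviation sigma_1024*sqrt(2) = 1.17*sqrt(q/1024), which
+ * is the target 1.17*sqrt(q/(2*N)) for N = 512.
+ */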
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
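+/*
+ * (Illustrative note, added for clarity.) In the chosen NTT ordering,
+ * the two roots in each consecutive pair are x and -x, so the products
+ * t1[2v]*t1[2v+1] computed below evaluate N(f)(x^2) = f(x)*f(-x); this
+ * is the degree-halving step used by the recursive NTRU solver.
+ */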
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because a failure here
+ * means that the NTRU equation cannot be solved, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficient will remain low. If it goes out-of-range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+		 * should fit in the -(2^31-1)..+(2^31-1) range.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
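+		/*
+		 * Square-and-multiply: at the end of the loop below, pdc
+		 * equals pt^dc, i.e. 2^(-dc) for the original signed dc.
+		 */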
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+		 * "late" instead of "early"). We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
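+			/*
+			 * Limbs hold 31 bits each; bit 30 of the top limb is
+			 * the sign bit, so sw is 0 for a nonnegative value and
+			 * 0x7FFFFFFF for a negative one.
+			 */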
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the n first elements
+		 * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+		 * Set fx and gx to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depth except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+		 * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+	 * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+	 * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
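+	/*
+	 * modp_montymul(a, b, p, p0i) returns a*b/R mod p (Montgomery
+	 * multiplication), so r below is q/R mod p and each z equals
+	 * (f*G - g*F)/R mod p at one NTT point; comparing them checks
+	 * that fG - gF = q modulo p.
+	 */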
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+		 * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
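+		/*
+		 * If either squared norm has its top bit set, the OR
+		 * below saturates norm to 2^32-1, so the bound check
+		 * necessarily fails.
+		 */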
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED512_CLEAN_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/pqclean.c
new file mode 100644
index 000000000..7edf6a874
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 666 bytes
+ *
+ * message + signature:
+ * signature 666 bytes
+ * message
+ */
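+
+/*
+ * For Falcon-padded-512, nnnn = 9, so the header bytes used below are
+ * 0x59 (private key), 0x09 (public key) and 0x39 (signature).
+ */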
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_9];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512];
+ uint16_t h[512];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED512_CLEAN_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 9;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ f, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ g, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ F, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 9;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 9);
+ if (v != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512], G[512];
+ struct {
+ int16_t sig[512];
+ uint16_t hm[512];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 9) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ f, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ g, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ F, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED512_CLEAN_complete_private(G, f, g, F, 9, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash message nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(&sc, r.hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCONPADDED512_CLEAN_comp_encode(sigbuf, sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[512], hm[512];
+ int16_t sig[512];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 9) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED512_CLEAN_modq_decode(h, 9,
+ pk + 1, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_to_ntt_monty(h, 9);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED512_CLEAN_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
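+
+	/*
+	 * For the padded format, the signature buffer has a fixed
+	 * length; any bytes after the encoded value must be zero
+	 * padding, otherwise the signature is rejected.
+	 */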
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(&sc, hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED512_CLEAN_verify_raw(hm, sig, h, 9, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 9;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 9) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 9;
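+	/*
+	 * Account for the header byte in sigbuflen, so that *smlen
+	 * below equals mlen + CRYPTO_BYTES.
+	 */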
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 9) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/rng.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/rng.c
new file mode 100644
index 000000000..ccce5e886
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/rng.c
@@ -0,0 +1,188 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <string.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ uint32_t *d32 = (uint32_t *) p->state.d;
+ uint64_t *d64 = (uint64_t *) p->state.d;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ d32[i] = w;
+ }
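+
+	/*
+	 * Bytes 48..55 of the state hold the 64-bit block counter,
+	 * reassembled here in little-endian order.
+	 */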
+ tl = d32[48 / sizeof(uint32_t)];
+ th = d32[52 / sizeof(uint32_t)];
+ d64[48 / sizeof(uint64_t)] = tl + (th << 32);
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists in key (32 bytes) then IV (16 bytes) and block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
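+		 * Word v of ChaCha20 instance u lands at byte offset
+		 * (u << 2) + (v << 5) of the output buffer.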
+ */
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+		memcpy(buf, p->buf.d + p->ptr, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/sign.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/sign.c
new file mode 100644
index 000000000..5e37a4613
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/sign.c
@@ -0,0 +1,1248 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+	 * has two child trees of size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
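+	 *
+	 * which resolves to s(logn) = (logn + 1) * 2^logn, the value
+	 * returned below.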
+ */
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
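+ *
+ * With the offsets below, this amounts to 4*2^logn fpr values for the
+ * B0 matrix plus ffLDL_treesize(logn) = (logn+1)*2^logn values for the
+ * tree.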
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
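The header comments of the two signing paths above state their scratch needs: six polynomials for do_sign_tree() and nine for do_sign_dyn(), which must also rebuild the basis and the Gram matrix in place. A small sketch making those budgets explicit (hypothetical helper names):

/* Illustration only: minimum tmp[] budgets, in fpr slots (8 bytes each). */
static size_t sign_tree_tmp_fprs(unsigned logn) { return 6u * ((size_t)1 << logn); }
static size_t sign_dyn_tmp_fprs(unsigned logn)  { return 9u * ((size_t)1 << logn); }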
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ size_t u;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+ z = 0;
+ for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) {
+ uint32_t w0, w1, w2, cc;
+
+ w0 = dist[u + 2];
+ w1 = dist[u + 1];
+ w2 = dist[u + 0];
+ cc = (v0 - w0) >> 31;
+ cc = (v1 - w1 - cc) >> 31;
+ cc = (v2 - w2 - cc) >> 31;
+ z += (int)cc;
+ }
+ return z;
+
+}
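The loop above counts, in constant time, how many cumulative-distribution rows strictly exceed the 72-bit random value, by propagating a borrow across the three 24-bit limbs. For illustration only, the same count with an explicit (non-constant-time) 72-bit comparison; gaussian0_reference is a hypothetical name:

/* Illustration only, not constant-time. Each table row stores its high
 * 24-bit limb first (dist[u]); a row counts when (w2,w1,w0) > (v2,v1,v0). */
static int
gaussian0_reference(uint32_t v0, uint32_t v1, uint32_t v2,
                    const uint32_t *dist, size_t len) {
    int z = 0;
    for (size_t u = 0; u < len; u += 3) {
        uint32_t w2 = dist[u], w1 = dist[u + 1], w0 = dist[u + 2];
        if (v2 < w2 || (v2 == w2 && (v1 < w1 || (v1 == w1 && v0 < w0)))) {
            z ++;
        }
    }
    return z;
}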
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
+ * PRNG output to limit its consumption, the sign of the difference
+ * yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
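The saturation of s at 63 in BerExp() is done without a branch. A minimal sketch of that bit trick in isolation (saturate_at_63 is a hypothetical name):

/* Illustration only: when sw > 63, (63 - sw) wraps to a value whose top bit
 * is 1, so -((63 - sw) >> 31) is an all-ones mask and the XOR swaps sw for
 * 63; otherwise the mask is zero and sw is unchanged. */
static inline uint32_t
saturate_at_63(uint32_t sw) {
    return sw ^ ((sw ^ 63) & -((63 - sw) >> 31));
}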
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED512_CLEAN_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
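The expression z = b + ((b << 1) - 1) * z0 above maps the half-Gaussian output onto the bimodal distribution without branching. Written out with an explicit branch, purely for illustration:

/* Illustration only: b = 1 gives z0 + 1 (Gaussian centered on 1, z >= 1),
 * b = 0 gives -z0 (Gaussian centered on 0, z <= 0). */
static inline int
bimodal_z(int b, int z0) {
    return (b == 1) ? (z0 + 1) : -z0;
}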
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/vrfy.c
new file mode 100644
index 000000000..5bcc2b52b
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_rshift1(uint32_t x) {
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x and y are lower than q, z = x*y is at most
+ * (q - 1)^2 and w is at most (2^16 - 1) * q, so their sum fits
+ * on 30 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We subtract q, then add it back if the result went
+ * negative, so that the result lands in the 0..q-1 range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
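The constants Q, Q0I, R and R2 defined earlier can be cross-checked with plain 64-bit arithmetic; a throwaway sketch (check_montgomery_constants is a hypothetical helper):

#include <assert.h>

/* Illustration only: R = 2^16 mod q, R2 = 2^32 mod q, and q * Q0I must be
 * congruent to -1 modulo 2^16. */
static void
check_montgomery_constants(void) {
    assert(65536u % 12289u == 4091u);                 /* R   */
    assert((65536ull * 65536ull) % 12289u == 10952u); /* R2  */
    assert((12289u * 12287u) % 65536u == 65535u);     /* Q0I */
}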
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
+ return mq_montymul(y18, x);
+}
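The chain above evaluates y^(q-2) with a fixed sequence of Montgomery squarings and multiplications, then folds in x. For illustration, a naive (non-constant-time) square-and-multiply computing the same quotient; naive_div_12289 is a hypothetical name:

/* Illustration only, not constant-time: x * y^(q-2) mod q by binary
 * exponentiation on q - 2 = 12287. */
static uint32_t
naive_div_12289(uint32_t x, uint32_t y) {
    uint32_t acc = 1;
    for (uint32_t e = 12289 - 2; e > 0; e >>= 1) {
        if (e & 1) {
            acc = (acc * y) % 12289;
        }
        y = (y * y) % 12289;
    }
    return (acc * x) % 12289;
}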
+
+/*
+ * Compute NTT on a ring element.
+ */
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+ * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED512_CLEAN_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED512_CLEAN_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/sig_falcon.h b/src/sig/falcon/sig_falcon.h
index dfd43e88b..a8eb1454f 100644
--- a/src/sig/falcon/sig_falcon.h
+++ b/src/sig/falcon/sig_falcon.h
@@ -8,7 +8,7 @@
#if defined(OQS_ENABLE_SIG_falcon_512)
#define OQS_SIG_falcon_512_length_public_key 897
#define OQS_SIG_falcon_512_length_secret_key 1281
-#define OQS_SIG_falcon_512_length_signature 666
+#define OQS_SIG_falcon_512_length_signature 752
OQS_SIG *OQS_SIG_falcon_512_new(void);
OQS_API OQS_STATUS OQS_SIG_falcon_512_keypair(uint8_t *public_key, uint8_t *secret_key);
@@ -19,7 +19,7 @@ OQS_API OQS_STATUS OQS_SIG_falcon_512_verify(const uint8_t *message, size_t mess
#if defined(OQS_ENABLE_SIG_falcon_1024)
#define OQS_SIG_falcon_1024_length_public_key 1793
#define OQS_SIG_falcon_1024_length_secret_key 2305
-#define OQS_SIG_falcon_1024_length_signature 1280
+#define OQS_SIG_falcon_1024_length_signature 1462
OQS_SIG *OQS_SIG_falcon_1024_new(void);
OQS_API OQS_STATUS OQS_SIG_falcon_1024_keypair(uint8_t *public_key, uint8_t *secret_key);
@@ -27,4 +27,26 @@ OQS_API OQS_STATUS OQS_SIG_falcon_1024_sign(uint8_t *signature, size_t *signatur
OQS_API OQS_STATUS OQS_SIG_falcon_1024_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
#endif
+#if defined(OQS_ENABLE_SIG_falcon_padded_512)
+#define OQS_SIG_falcon_padded_512_length_public_key 897
+#define OQS_SIG_falcon_padded_512_length_secret_key 1281
+#define OQS_SIG_falcon_padded_512_length_signature 666
+
+OQS_SIG *OQS_SIG_falcon_padded_512_new(void);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024)
+#define OQS_SIG_falcon_padded_1024_length_public_key 1793
+#define OQS_SIG_falcon_padded_1024_length_secret_key 2305
+#define OQS_SIG_falcon_padded_1024_length_signature 1280
+
+OQS_SIG *OQS_SIG_falcon_padded_1024_new(void);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
#endif
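With these declarations in place, the new variants can also be driven through the generic OQS_SIG object API. A minimal usage sketch, assuming the OQS_SIG_alg_falcon_padded_512 identifier is registered elsewhere in this change set; unlike plain Falcon, the padded variant always emits signatures of exactly OQS_SIG_falcon_padded_512_length_signature bytes:

/* Illustration only, not part of the patch. */
#include <oqs/oqs.h>

static int
demo_falcon_padded_512(void) {
    OQS_SIG *sig = OQS_SIG_new(OQS_SIG_alg_falcon_padded_512);
    if (sig == NULL) {
        return -1; /* algorithm disabled in this build */
    }
    uint8_t pk[OQS_SIG_falcon_padded_512_length_public_key];
    uint8_t sk[OQS_SIG_falcon_padded_512_length_secret_key];
    uint8_t sm[OQS_SIG_falcon_padded_512_length_signature];
    size_t smlen = 0;
    const uint8_t msg[] = "example message";
    OQS_STATUS rc = OQS_SIG_keypair(sig, pk, sk);
    if (rc == OQS_SUCCESS) {
        rc = OQS_SIG_sign(sig, sm, &smlen, msg, sizeof msg, sk);
    }
    if (rc == OQS_SUCCESS) {
        rc = OQS_SIG_verify(sig, msg, sizeof msg, sm, smlen, pk);
    }
    OQS_SIG_free(sig);
    return (rc == OQS_SUCCESS) ? 0 : -1;
}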
diff --git a/src/sig/falcon/sig_falcon_padded_1024.c b/src/sig/falcon/sig_falcon_padded_1024.c
new file mode 100644
index 000000000..53b8c3926
--- /dev/null
+++ b/src/sig/falcon/sig_falcon_padded_1024.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_falcon.h>
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024)
+
+OQS_SIG *OQS_SIG_falcon_padded_1024_new(void) {
+
+ OQS_SIG *sig = malloc(sizeof(OQS_SIG));
+ if (sig == NULL) {
+ return NULL;
+ }
+ sig->method_name = OQS_SIG_alg_falcon_padded_1024;
+ sig->alg_version = "20211101 with PQClean patches";
+
+ sig->claimed_nist_level = 5;
+ sig->euf_cma = true;
+
+ sig->length_public_key = OQS_SIG_falcon_padded_1024_length_public_key;
+ sig->length_secret_key = OQS_SIG_falcon_padded_1024_length_secret_key;
+ sig->length_signature = OQS_SIG_falcon_padded_1024_length_signature;
+
+ sig->keypair = OQS_SIG_falcon_padded_1024_keypair;
+ sig->sign = OQS_SIG_falcon_padded_1024_sign;
+ sig->verify = OQS_SIG_falcon_padded_1024_verify;
+
+ return sig;
+}
+
+extern int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+extern int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+extern int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(public_key, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif
+}
+
+#endif
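The wrapper above follows liboqs' usual backend-selection pattern: the preprocessor picks the best implementation that was compiled in, and when OQS_DIST_BUILD is set (a single binary meant to run on many CPUs) an extra run-time check on the CPU extension decides between the optimized backend and the portable CLEAN code. A distilled sketch of that pattern with hypothetical names, since the real probe and backends live elsewhere in the library:

#include <stdbool.h>

/* Hypothetical stand-ins for OQS_CPU_has_extension() and the PQClean backends. */
extern bool cpu_has_avx2(void);
extern int backend_avx2(void);
extern int backend_clean(void);

int dispatch(void) {
#if defined(ENABLE_AVX2_BACKEND)      /* optimized code was compiled in */
#if defined(DIST_BUILD)               /* portable binary: probe the CPU first */
    if (cpu_has_avx2()) {
#endif
        return backend_avx2();
#if defined(DIST_BUILD)
    } else {
        return backend_clean();       /* CPU lacks AVX2: portable fallback */
    }
#endif
#else
    return backend_clean();           /* only the portable backend exists */
#endif
}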
diff --git a/src/sig/falcon/sig_falcon_padded_512.c b/src/sig/falcon/sig_falcon_padded_512.c
new file mode 100644
index 000000000..9521187b8
--- /dev/null
+++ b/src/sig/falcon/sig_falcon_padded_512.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_falcon.h>
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_512)
+
+OQS_SIG *OQS_SIG_falcon_padded_512_new(void) {
+
+ OQS_SIG *sig = malloc(sizeof(OQS_SIG));
+ if (sig == NULL) {
+ return NULL;
+ }
+ sig->method_name = OQS_SIG_alg_falcon_padded_512;
+ sig->alg_version = "20211101 with PQClean patches";
+
+ sig->claimed_nist_level = 1;
+ sig->euf_cma = true;
+
+ sig->length_public_key = OQS_SIG_falcon_padded_512_length_public_key;
+ sig->length_secret_key = OQS_SIG_falcon_padded_512_length_secret_key;
+ sig->length_signature = OQS_SIG_falcon_padded_512_length_signature;
+
+ sig->keypair = OQS_SIG_falcon_padded_512_keypair;
+ sig->sign = OQS_SIG_falcon_padded_512_sign;
+ sig->verify = OQS_SIG_falcon_padded_512_verify;
+
+ return sig;
+}
+
+extern int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+extern int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+extern int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(public_key, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif
+}
+
+#endif
diff --git a/src/sig/sig.c b/src/sig/sig.c
index b953af756..ae4147838 100644
--- a/src/sig/sig.c
+++ b/src/sig/sig.c
@@ -26,6 +26,8 @@ OQS_API const char *OQS_SIG_alg_identifier(size_t i) {
OQS_SIG_alg_ml_dsa_87,
OQS_SIG_alg_falcon_512,
OQS_SIG_alg_falcon_1024,
+ OQS_SIG_alg_falcon_padded_512,
+ OQS_SIG_alg_falcon_padded_1024,
OQS_SIG_alg_sphincs_sha2_128f_simple,
OQS_SIG_alg_sphincs_sha2_128s_simple,
OQS_SIG_alg_sphincs_sha2_192f_simple,
@@ -133,6 +135,20 @@ OQS_API int OQS_SIG_alg_is_enabled(const char *method_name) {
return 0;
#endif
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_512)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_512
+ return 1;
+#else
+ return 0;
+#endif
+
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_1024)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_1024
+ return 1;
+#else
+ return 0;
+#endif
+
} else if (0 == strcasecmp(method_name, OQS_SIG_alg_sphincs_sha2_128f_simple)) {
#ifdef OQS_ENABLE_SIG_sphincs_sha2_128f_simple
return 1;
@@ -305,6 +321,20 @@ OQS_API OQS_SIG *OQS_SIG_new(const char *method_name) {
return NULL;
#endif
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_512)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_512
+ return OQS_SIG_falcon_padded_512_new();
+#else
+ return NULL;
+#endif
+
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_1024)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_1024
+ return OQS_SIG_falcon_padded_1024_new();
+#else
+ return NULL;
+#endif
+
} else if (0 == strcasecmp(method_name, OQS_SIG_alg_sphincs_sha2_128f_simple)) {
#ifdef OQS_ENABLE_SIG_sphincs_sha2_128f_simple
return OQS_SIG_sphincs_sha2_128f_simple_new();
diff --git a/src/sig/sig.h b/src/sig/sig.h
index 97a40cd88..11db75f00 100644
--- a/src/sig/sig.h
+++ b/src/sig/sig.h
@@ -54,6 +54,10 @@ extern "C" {
#define OQS_SIG_alg_falcon_512 "Falcon-512"
/** Algorithm identifier for Falcon-1024 */
#define OQS_SIG_alg_falcon_1024 "Falcon-1024"
+/** Algorithm identifier for Falcon-padded-512 */
+#define OQS_SIG_alg_falcon_padded_512 "Falcon-padded-512"
+/** Algorithm identifier for Falcon-padded-1024 */
+#define OQS_SIG_alg_falcon_padded_1024 "Falcon-padded-1024"
/** Algorithm identifier for SPHINCS+-SHA2-128f-simple */
#define OQS_SIG_alg_sphincs_sha2_128f_simple "SPHINCS+-SHA2-128f-simple"
/** Algorithm identifier for SPHINCS+-SHA2-128s-simple */
@@ -83,7 +87,7 @@ extern "C" {
///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START
/** Number of algorithm identifiers above. */
-#define OQS_SIG_algs_length 23
+#define OQS_SIG_algs_length 25
///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END
/**
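With the count bumped to 25, the two new identifiers appear in the normal enumeration APIs, so callers that iterate over OQS_SIG_algs_length pick them up without further changes. A small sketch listing every known signature algorithm and whether it is enabled in the current build:

#include <stddef.h>
#include <stdio.h>

#include <oqs/oqs.h>

int main(void) {
    for (size_t i = 0; i < OQS_SIG_algs_length; i++) {
        const char *name = OQS_SIG_alg_identifier(i);
        printf("%-28s %s\n", name,
               OQS_SIG_alg_is_enabled(name) ? "enabled" : "disabled");
    }
    return 0;
}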
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
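This hunk and the similar ones below only adjust spacing around the multiplication, but for orientation: the loops implement MGF1, where the output is built from successive hash blocks of the seed concatenated with a 4-byte big-endian counter, truncated to the requested length. A minimal sketch, assuming a one-shot sha256(out, in, inlen) primitive with the same shape as the call above; the fixed-size seed buffer is a simplification for the sketch:

#include <stdint.h>
#include <string.h>

#define SHA256_OUT 32

/* Assumed one-shot hash, mirroring the sha256() used in the hunk above. */
void sha256(unsigned char *out, const unsigned char *in, size_t inlen);

static void u32_to_bytes_be(unsigned char *out, uint32_t v) {
    out[0] = (unsigned char)(v >> 24);
    out[1] = (unsigned char)(v >> 16);
    out[2] = (unsigned char)(v >> 8);
    out[3] = (unsigned char)v;
}

/* MGF1 over SHA-256: out = H(seed || 0) || H(seed || 1) || ..., truncated. */
static void mgf1_sha256(unsigned char *out, unsigned long outlen,
                        const unsigned char *seed, unsigned long seedlen) {
    unsigned char inbuf[64 + 4];          /* sketch assumes seedlen <= 64 */
    unsigned char lastblock[SHA256_OUT];
    unsigned long i;

    memcpy(inbuf, seed, seedlen);
    /* Full output blocks. */
    for (i = 0; (i + 1) * SHA256_OUT <= outlen; i++) {
        u32_to_bytes_be(inbuf + seedlen, (uint32_t)i);
        sha256(out, inbuf, seedlen + 4);
        out += SHA256_OUT;
    }
    /* Trailing partial block, if the requested length is not a multiple. */
    if (outlen > i * SHA256_OUT) {
        u32_to_bytes_be(inbuf + seedlen, (uint32_t)i);
        sha256(lastblock, inbuf, seedlen + 4);
        memcpy(out, lastblock, outlen - i * SHA256_OUT);
    }
}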
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json
index 73595b103..e60fe897b 100644
--- a/tests/KATs/sig/kats.json
+++ b/tests/KATs/sig/kats.json
@@ -19,6 +19,14 @@
"all": "f4f23c1153682007d5dec02c35e47061c17900fcf0adb3fd0437f1988fa13655",
"single": "da27fe8a462de7307ddf1f9b00072a457d9c5b14e838c148fbe2662094b9a2ca"
},
+ "Falcon-padded-1024": {
+ "all": "907a4931ddc2ce8360478a45f1bffededd6a04015b00233ecd851a62ecba06c1",
+ "single": "ddcc5683293388249e6fe85e992ea19d0986d34e060a44f82bc3db524a8c8390"
+ },
+ "Falcon-padded-512": {
+ "all": "362ecc0537ca1fe25143fb7ccb04de8ee7703469d13ebcf311ab124a5c374a65",
+ "single": "91842d41138e7cfaf6e2e8f12a03c3b3411302255121e4d07d02f91a003c0395"
+ },
"ML-DSA-44": {
"all": "183bc0c4398ade4fc17b6a7d876b82545a96331139a4f27269c95664b8c483f9",
"single": "e6f3ec4dc0b02dd3bcbbc6b105190e1890ca0bb3f802e2b571f0d70f3993a2e1"
diff --git a/tests/constant_time/sig/issues.json b/tests/constant_time/sig/issues.json
index 2cb9f200b..3b174fdca 100644
--- a/tests/constant_time/sig/issues.json
+++ b/tests/constant_time/sig/issues.json
@@ -5,6 +5,8 @@
"Dilithium5": [],
"Falcon-1024": ["falcon"],
"Falcon-512": ["falcon"],
+ "Falcon-padded-1024": ["falcon"],
+ "Falcon-padded-512": ["falcon"],
"ML-DSA-44-ipd": [],
"ML-DSA-65-ipd": [],
"ML-DSA-87-ipd": [],
diff --git a/tests/constant_time/sig/passes.json b/tests/constant_time/sig/passes.json
index fee99dcfc..a6096eb64 100644
--- a/tests/constant_time/sig/passes.json
+++ b/tests/constant_time/sig/passes.json
@@ -5,6 +5,8 @@
"Dilithium5": ["dilithium", "dilithium-avx2", "dilithium-aarch64"],
"Falcon-1024": ["falcon_keygen", "falcon_sign"],
"Falcon-512": ["falcon_keygen", "falcon_sign"],
+ "Falcon-padded-1024": ["falcon_keygen", "falcon_sign"],
+ "Falcon-padded-512": ["falcon_keygen", "falcon_sign"],
"ML-DSA-44-ipd": ["ml_dsa", "ml_dsa-avx2"],
"ML-DSA-65-ipd": ["ml_dsa", "ml_dsa-avx2"],
"ML-DSA-87-ipd": ["ml_dsa", "ml_dsa-avx2"],
diff --git a/tests/kat_sig.c b/tests/kat_sig.c
index db70d1dd3..21c208f3a 100644
--- a/tests/kat_sig.c
+++ b/tests/kat_sig.c
@@ -132,6 +132,26 @@ OQS_STATUS combine_message_signature(uint8_t **signed_msg, size_t *signed_msg_le
(*signed_msg)[42 + msg_len] = 0x2A;
memcpy(*signed_msg + 42 + msg_len + 1, falc_sig, signature_len - 41);
return OQS_SUCCESS;
+ } else if (0 == strcmp(sig->method_name, "Falcon-padded-512")) {
+ // signed_msg = signature || msg
+ *signed_msg_len = signature_len + msg_len;
+ *signed_msg = malloc(*signed_msg_len);
+ if (*signed_msg == NULL) {
+ return OQS_ERROR;
+ }
+ memcpy(*signed_msg, signature, signature_len);
+ memcpy(*signed_msg + signature_len, msg, msg_len);
+ return OQS_SUCCESS;
+ } else if (0 == strcmp(sig->method_name, "Falcon-padded-1024")) {
+ // signed_msg = signature || msg
+ *signed_msg_len = signature_len + msg_len;
+ *signed_msg = malloc(*signed_msg_len);
+ if (*signed_msg == NULL) {
+ return OQS_ERROR;
+ }
+ memcpy(*signed_msg, signature, signature_len);
+ memcpy(*signed_msg + signature_len, msg, msg_len);
+ return OQS_SUCCESS;
} else if (0 == strcmp(sig->method_name, "SPHINCS+-SHA2-128f-simple")) {
// signed_msg = signature || msg
*signed_msg_len = signature_len + msg_len;