diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 6f8e07ba8..df921ef99 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -348,6 +348,28 @@ if(OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS AND OQS_USE_ARM_NEO
endif()
endif()
+cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_512 "" ON "OQS_ENABLE_SIG_FALCON" OFF)
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_512_avx2 "" ON "OQS_ENABLE_SIG_falcon_padded_512" OFF)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS AND OQS_USE_ARM_NEON_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_512_aarch64 "" ON "OQS_ENABLE_SIG_falcon_padded_512" OFF)
+endif()
+endif()
+
+cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_1024 "" ON "OQS_ENABLE_SIG_FALCON" OFF)
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_1024_avx2 "" ON "OQS_ENABLE_SIG_falcon_padded_1024" OFF)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS AND OQS_USE_ARM_NEON_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_falcon_padded_1024_aarch64 "" ON "OQS_ENABLE_SIG_falcon_padded_1024" OFF)
+endif()
+endif()
+
option(OQS_ENABLE_SIG_SPHINCS "Enable sphincs algorithm family" ON)
cmake_dependent_option(OQS_ENABLE_SIG_sphincs_sha2_128f_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
@@ -448,7 +470,7 @@ if(NOT ((OQS_MINIMAL_BUILD STREQUAL "") OR (OQS_MINIMAL_BUILD STREQUAL "OFF")))
filter_algs("${OQS_MINIMAL_BUILD}")
elseif (${OQS_ALGS_ENABLED} STREQUAL "STD")
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_START
- filter_algs("KEM_ml_kem_512_ipd;KEM_ml_kem_512;KEM_ml_kem_768_ipd;KEM_ml_kem_768;KEM_ml_kem_1024_ipd;KEM_ml_kem_1024;SIG_ml_dsa_44_ipd;SIG_ml_dsa_44;SIG_ml_dsa_65_ipd;SIG_ml_dsa_65;SIG_ml_dsa_87_ipd;SIG_ml_dsa_87;SIG_falcon_512;SIG_falcon_1024;SIG_sphincs_sha2_128f_simple;SIG_sphincs_sha2_128s_simple;SIG_sphincs_sha2_192f_simple;SIG_sphincs_sha2_192s_simple;SIG_sphincs_sha2_256f_simple;SIG_sphincs_sha2_256s_simple;SIG_sphincs_shake_128f_simple;SIG_sphincs_shake_128s_simple;SIG_sphincs_shake_192f_simple;SIG_sphincs_shake_192s_simple;SIG_sphincs_shake_256f_simple;SIG_sphincs_shake_256s_simple")
+ filter_algs("KEM_ml_kem_512_ipd;KEM_ml_kem_512;KEM_ml_kem_768_ipd;KEM_ml_kem_768;KEM_ml_kem_1024_ipd;KEM_ml_kem_1024;SIG_ml_dsa_44_ipd;SIG_ml_dsa_44;SIG_ml_dsa_65_ipd;SIG_ml_dsa_65;SIG_ml_dsa_87_ipd;SIG_ml_dsa_87;SIG_falcon_512;SIG_falcon_1024;SIG_falcon_padded_512;SIG_falcon_padded_1024;SIG_sphincs_sha2_128f_simple;SIG_sphincs_sha2_128s_simple;SIG_sphincs_sha2_192f_simple;SIG_sphincs_sha2_192s_simple;SIG_sphincs_sha2_256f_simple;SIG_sphincs_sha2_256s_simple;SIG_sphincs_shake_128f_simple;SIG_sphincs_shake_128s_simple;SIG_sphincs_shake_192f_simple;SIG_sphincs_shake_192s_simple;SIG_sphincs_shake_256f_simple;SIG_sphincs_shake_256s_simple")
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_END
elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_R4")
filter_algs("KEM_classic_mceliece_348864;KEM_classic_mceliece_348864f;KEM_classic_mceliece_460896;KEM_classic_mceliece_460896f;KEM_classic_mceliece_6688128;KEM_classic_mceliece_6688128f;KEM_classic_mceliece_6960119;KEM_classic_mceliece_6960119f;KEM_classic_mceliece_8192128;KEM_classic_mceliece_8192128f;KEM_hqc_128;KEM_hqc_192;KEM_hqc_256;KEM_bike_l1;KEM_bike_l3")
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 493670f80..5c15e2dc3 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -158,7 +158,7 @@ jobs:
# See https://github.com/open-quantum-safe/liboqs/issues/738#issuecomment-621394744
default: --numprocesses=auto
machine:
- image: ubuntu-2004:202101-01
+ image: default # analogous to ubuntu-latest on GH Actions
resource_class: arm.medium
steps:
- checkout
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 28b6f3cca..327e04d16 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -46,12 +46,10 @@ jobs:
container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
CMAKE_ARGS: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=generic
PYTEST_ARGS: --numprocesses=auto -k 'test_kat_all'
- SKIP_ALGS: 'Falcon-1024' # re-enable when #1561 is resolved
- name: extensions
container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
CMAKE_ARGS: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=haswell
PYTEST_ARGS: --numprocesses=auto -k 'test_kat_all'
- SKIP_ALGS: 'Falcon-1024' # re-enable when #1561 is resolved
container:
image: ${{ matrix.container }}
steps:
diff --git a/README.md b/README.md
index f9b49615d..738fa19d5 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ The list below indicates all algorithms supported by liboqs, but not all those a
- **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5
-- **Falcon**: Falcon-512, Falcon-1024
+- **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024
- **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87)
- **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple
- **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple
@@ -185,6 +185,7 @@ liboqs includes some third party libraries or modules that are licensed differen
- `src/kem/ml_kem/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/dilithium/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/dilithium/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT
+- `src/sig/falcon/pqclean_*_aarch64`: Apache License v2.0
- `src/sig/ml_dsa/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/sphincs/pqclean_*`: CC0 (public domain)
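For reviewers who want to exercise the new parameter sets, here is a minimal sketch against liboqs's generic signature API (a sketch only, not part of the patch; the algorithm is looked up by the display name listed in the README section above, and error handling is trimmed):

    #include <oqs/oqs.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        /* "Falcon-padded-512" is the display name added to the list above. */
        OQS_SIG *sig = OQS_SIG_new("Falcon-padded-512");
        if (sig == NULL) {
            fprintf(stderr, "Falcon-padded-512 is not enabled in this build\n");
            return EXIT_FAILURE;
        }
        uint8_t *pk = malloc(sig->length_public_key);
        uint8_t *sk = malloc(sig->length_secret_key);
        uint8_t *sm = malloc(sig->length_signature);  /* fixed length for the padded variants */
        size_t smlen = 0;
        const uint8_t msg[] = "example message";

        /* keygen -> sign -> verify round trip */
        if (OQS_SIG_keypair(sig, pk, sk) != OQS_SUCCESS ||
            OQS_SIG_sign(sig, sm, &smlen, msg, sizeof msg, sk) != OQS_SUCCESS ||
            OQS_SIG_verify(sig, msg, sizeof msg, sm, smlen, pk) != OQS_SUCCESS) {
            fprintf(stderr, "round trip failed\n");
            return EXIT_FAILURE;
        }
        printf("signature length: %zu bytes\n", smlen);  /* 666 for Falcon-padded-512 */

        OQS_MEM_secure_free(sk, sig->length_secret_key);
        free(pk);
        free(sm);
        OQS_SIG_free(sig);
        return EXIT_SUCCESS;
    }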
diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md
index 68840c4b0..2c6a267e4 100644
--- a/docs/algorithms/kem/classic_mceliece.md
+++ b/docs/algorithms/kem/classic_mceliece.md
@@ -6,7 +6,7 @@
- **Authors' website**: https://classic.mceliece.org
- **Specification version**: SUPERCOP-20221025.
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
- **Implementation license (SPDX-Identifier)**: Public domain
- **Ancestors of primary source**:
- SUPERCOP-20221025 "clean" and "avx2" implementations
diff --git a/docs/algorithms/kem/classic_mceliece.yml b/docs/algorithms/kem/classic_mceliece.yml
index 3af5a3e74..99a828bc6 100644
--- a/docs/algorithms/kem/classic_mceliece.yml
+++ b/docs/algorithms/kem/classic_mceliece.yml
@@ -378,4 +378,4 @@ parameter-sets:
auxiliary-submitters: []
primary-upstream:
spdx-license-identifier: Public domain
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
diff --git a/docs/algorithms/kem/hqc.md b/docs/algorithms/kem/hqc.md
index 58d083481..dca44d745 100644
--- a/docs/algorithms/kem/hqc.md
+++ b/docs/algorithms/kem/hqc.md
@@ -6,7 +6,7 @@
- **Authors' website**: https://pqc-hqc.org/
- **Specification version**: 2023-04-30.
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
- **Implementation license (SPDX-Identifier)**: Public domain
- **Ancestors of primary source**:
- https://github.com/SWilson4/package-pqclean/tree/8db1b24b/hqc, which takes it from:
diff --git a/docs/algorithms/kem/hqc.yml b/docs/algorithms/kem/hqc.yml
index 1bcbe6566..8e78c4f9c 100644
--- a/docs/algorithms/kem/hqc.yml
+++ b/docs/algorithms/kem/hqc.yml
@@ -76,4 +76,4 @@ parameter-sets:
upstream: primary-upstream
primary-upstream:
spdx-license-identifier: Public domain
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md
index df0580968..3dd6dddc9 100644
--- a/docs/algorithms/sig/falcon.md
+++ b/docs/algorithms/sig/falcon.md
@@ -7,24 +7,30 @@
- **Authors' website**: https://falcon-sign.info
- **Specification version**: 20211101.
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
- **Implementation license (SPDX-Identifier)**: MIT
+- **Optimized Implementation sources**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
+ - **pqclean-aarch64**:
+ - **Source**: https://github.com/PQClean/PQClean/commit/7707d1bcc8ae7f9ffd296dd13b1d76d2767d14f8
+ - **Implementation license (SPDX-Identifier)**: Apache-2.0
## Parameter set summary
-| Parameter set | Parameter set alias | Security model | Claimed NIST Level | Public key size (bytes) | Secret key size (bytes) | Signature size (bytes) |
-|:---------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:|
-| Falcon-512 | NA | EUF-CMA | 1 | 897 | 1281 | 666 |
-| Falcon-1024 | NA | EUF-CMA | 5 | 1793 | 2305 | 1280 |
+| Parameter set | Parameter set alias | Security model | Claimed NIST Level | Public key size (bytes) | Secret key size (bytes) | Signature size (bytes) |
+|:------------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:|
+| Falcon-512 | NA | EUF-CMA | 1 | 897 | 1281 | 752 |
+| Falcon-1024 | NA | EUF-CMA | 5 | 1793 | 2305 | 1462 |
+| Falcon-padded-512 | NA | EUF-CMA | 1 | 897 | 1281 | 666 |
+| Falcon-padded-1024 | NA | EUF-CMA | 5 | 1793 | 2305 | 1280 |
## Falcon-512 implementation characteristics
-| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ |
-|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
-| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
-| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
-| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
Are implementations chosen based on runtime CPU feature detection? **Yes**.
@@ -32,11 +38,31 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
## Falcon-1024 implementation characteristics
-| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
-|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
-| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
-| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## Falcon-padded-512 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## Falcon-padded-1024 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:-----------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | clean | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | All | AVX2 | False | False | False |
+| [pqclean-aarch64](#pqclean-aarch64) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False |
Are implementations chosen based on runtime CPU feature detection? **Yes**.
diff --git a/docs/algorithms/sig/falcon.yml b/docs/algorithms/sig/falcon.yml
index aa6a80304..781e188e0 100644
--- a/docs/algorithms/sig/falcon.yml
+++ b/docs/algorithms/sig/falcon.yml
@@ -18,17 +18,21 @@ website: https://falcon-sign.info
nist-round: 3
spec-version: 20211101
primary-upstream:
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
spdx-license-identifier: MIT
upstream-ancestors:
- https://www.falcon-sign.info
+optimized-upstreams:
+ pqclean-aarch64:
+ source: https://github.com/PQClean/PQClean/commit/7707d1bcc8ae7f9ffd296dd13b1d76d2767d14f8
+ spdx-license-identifier: Apache-2.0
parameter-sets:
- name: Falcon-512
claimed-nist-level: 1
claimed-security: EUF-CMA
length-public-key: 897
length-secret-key: 1281
- length-signature: 666
+ length-signature: 752
implementations-switch-on-runtime-cpu-features: true
implementations:
- upstream: primary-upstream
@@ -50,7 +54,7 @@ parameter-sets:
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
- - upstream: primary-upstream
+ - upstream: pqclean-aarch64
upstream-id: aarch64
supported-platforms:
- architecture: ARM64_V8
@@ -67,7 +71,46 @@ parameter-sets:
claimed-security: EUF-CMA
length-public-key: 1793
length-secret-key: 2305
- length-signature: 1280
+ length-signature: 1462
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: clean
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+ - upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+ - upstream: pqclean-aarch64
+ upstream-id: aarch64
+ supported-platforms:
+ - architecture: ARM64_V8
+ operating_systems:
+ - Linux
+ - Darwin
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+- name: Falcon-padded-512
+ claimed-nist-level: 1
+ claimed-security: EUF-CMA
+ length-public-key: 897
+ length-secret-key: 1281
+ length-signature: 666
implementations-switch-on-runtime-cpu-features: true
implementations:
- upstream: primary-upstream
@@ -89,7 +132,46 @@ parameter-sets:
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
+ - upstream: pqclean-aarch64
+ upstream-id: aarch64
+ supported-platforms:
+ - architecture: ARM64_V8
+ operating_systems:
+ - Linux
+ - Darwin
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+- name: Falcon-padded-1024
+ claimed-nist-level: 5
+ claimed-security: EUF-CMA
+ length-public-key: 1793
+ length-secret-key: 2305
+ length-signature: 1280
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: clean
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
- upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: false
+ no-secret-dependent-branching-checked-by-valgrind: false
+ large-stack-usage: false
+ - upstream: pqclean-aarch64
upstream-id: aarch64
supported-platforms:
- architecture: ARM64_V8
diff --git a/docs/algorithms/sig/sphincs.md b/docs/algorithms/sig/sphincs.md
index a1660e483..096a87b29 100644
--- a/docs/algorithms/sig/sphincs.md
+++ b/docs/algorithms/sig/sphincs.md
@@ -7,7 +7,7 @@
- **Authors' website**: https://sphincs.org/
- **Specification version**: NIST Round 3 submission, v3.1 (June 10, 2022).
- **Primary Source**:
- - **Source**: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852 with copy_from_upstream patches
+ - **Source**: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789 with copy_from_upstream patches
- **Implementation license (SPDX-Identifier)**: CC0-1.0
diff --git a/docs/algorithms/sig/sphincs.yml b/docs/algorithms/sig/sphincs.yml
index b5148335a..d3e6816c9 100644
--- a/docs/algorithms/sig/sphincs.yml
+++ b/docs/algorithms/sig/sphincs.yml
@@ -26,7 +26,7 @@ nist-round: 3
spec-version: NIST Round 3 submission, v3.1 (June 10, 2022)
spdx-license-identifier: CC0-1.0
primary-upstream:
- source: https://github.com/PQClean/PQClean/commit/0657749a785db30e7f49e9435452cb042edb1852
+ source: https://github.com/PQClean/PQClean/commit/8e221ae797b229858a0b0d784577a8cb149d5789
with copy_from_upstream patches
spdx-license-identifier: CC0-1.0
upstream-ancestors:
diff --git a/docs/cbom.json b/docs/cbom.json
index 02d2d59ca..7dd47dc21 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -1,23 +1,23 @@
{
"bomFormat": "CBOM",
"specVersion": "1.4-cbom-1.0",
- "serialNumber": "urn:uuid:c25dad99-ad00-48b6-aa9e-25d4f7c3c8c5",
+ "serialNumber": "urn:uuid:b3ac0f3d-b320-4f0f-bbef-6c535c1e9874",
"version": 1,
"metadata": {
- "timestamp": "2023-12-13T17:05:36.137517",
+ "timestamp": "2024-03-05T11:49:42.428605",
"component": {
"type": "library",
- "bom-ref": "pkg:github/open-quantum-safe/liboqs@5f83324a6c464448b70b1e57b3cd161b6832e0e0",
+ "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
"name": "liboqs",
- "version": "5f83324a6c464448b70b1e57b3cd161b6832e0e0"
+ "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
}
},
"components": [
{
"type": "library",
- "bom-ref": "pkg:github/open-quantum-safe/liboqs@5f83324a6c464448b70b1e57b3cd161b6832e0e0",
+ "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
"name": "liboqs",
- "version": "5f83324a6c464448b70b1e57b3cd161b6832e0e0"
+ "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
},
{
"type": "crypto-asset",
@@ -1419,6 +1419,126 @@
"nistQuantumSecurityLevel": 5
}
},
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-512:generic",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-512",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-512:x86_64",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-512",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-512:armv8-a",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-512",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "armv8-a"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-1024:generic",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-1024",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-1024:x86_64",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-1024",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:Falcon-padded-1024:armv8-a",
+ "name": "Falcon",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "Falcon-padded-1024",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "armv8-a"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
{
"type": "crypto-asset",
"bom-ref": "alg:ML-DSA-44-ipd:generic",
@@ -2048,7 +2168,7 @@
],
"dependencies": [
{
- "ref": "pkg:github/open-quantum-safe/liboqs@5f83324a6c464448b70b1e57b3cd161b6832e0e0",
+ "ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
"dependsOn": [
"alg:BIKE-L1:x86_64",
"alg:BIKE-L3:x86_64",
@@ -2120,6 +2240,12 @@
"alg:Falcon-1024:generic",
"alg:Falcon-1024:x86_64",
"alg:Falcon-1024:armv8-a",
+ "alg:Falcon-padded-512:generic",
+ "alg:Falcon-padded-512:x86_64",
+ "alg:Falcon-padded-512:armv8-a",
+ "alg:Falcon-padded-1024:generic",
+ "alg:Falcon-padded-1024:x86_64",
+ "alg:Falcon-padded-1024:armv8-a",
"alg:ML-DSA-44-ipd:generic",
"alg:ML-DSA-44-ipd:x86_64",
"alg:ML-DSA-65-ipd:generic",
@@ -2675,6 +2801,48 @@
],
"dependencyType": "uses"
},
+ {
+ "ref": "alg:Falcon-padded-512:generic",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-512:x86_64",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-512:armv8-a",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-1024:generic",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-1024:x86_64",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:Falcon-padded-1024:armv8-a",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
{
"ref": "alg:ML-DSA-44-ipd:generic",
"dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index 32d897cdf..0db38f54b 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -548,6 +548,9 @@ def process_families(instructions, basedir, with_kat, with_generator):
print("Info: Updating KAT for %s" % (scheme['pretty_name_full']))
except KeyError: # new key
print("Adding new KAT for %s" % (scheme['pretty_name_full']))
+ # either a new scheme or a new KAT
+ if scheme['pretty_name_full'] not in kats['kem']:
+ kats['kem'][scheme['pretty_name_full']] = {}
pass
kats['kem'][scheme['pretty_name_full']]['single'] = scheme['metadata']['nistkat-sha256']
if 'alias_pretty_name_full' in scheme:
@@ -558,6 +561,9 @@ def process_families(instructions, basedir, with_kat, with_generator):
print("Info: Updating KAT for %s" % (scheme['pretty_name_full']))
except KeyError: # new key
print("Adding new KAT for %s" % (scheme['pretty_name_full']))
+ # either a new scheme or a new KAT
+ if scheme['pretty_name_full'] not in kats['sig']:
+ kats['sig'][scheme['pretty_name_full']] = {}
pass
kats['sig'][scheme['pretty_name_full']]['single'] = scheme['metadata']['nistkat-sha256']
if 'alias_pretty_name_full' in scheme:
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index f55b8798b..d8a9a4d12 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -14,7 +14,7 @@ upstreams:
name: pqclean
git_url: https://github.com/PQClean/PQClean.git
git_branch: master
- git_commit: 0657749a785db30e7f49e9435452cb042edb1852
+ git_commit: 8e221ae797b229858a0b0d784577a8cb149d5789
kem_meta_path: 'crypto_kem/{pqclean_scheme}/META.yml'
sig_meta_path: 'crypto_sign/{pqclean_scheme}/META.yml'
kem_scheme_path: 'crypto_kem/{pqclean_scheme}'
@@ -226,6 +226,16 @@ sigs:
pqclean_scheme: falcon-1024
pretty_name_full: Falcon-1024
signed_msg_order: falcon
+ -
+ scheme: "padded_512"
+ pqclean_scheme: falcon-padded-512
+ pretty_name_full: Falcon-padded-512
+ signed_msg_order: sig_then_msg
+ -
+ scheme: "padded_1024"
+ pqclean_scheme: falcon-padded-1024
+ pretty_name_full: Falcon-padded-1024
+ signed_msg_order: sig_then_msg
-
name: sphincs
default_implementation: clean
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index 4abe5c2ae..1b9b5a2d4 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -149,6 +149,12 @@
#cmakedefine OQS_ENABLE_SIG_falcon_1024 1
#cmakedefine OQS_ENABLE_SIG_falcon_1024_avx2 1
#cmakedefine OQS_ENABLE_SIG_falcon_1024_aarch64 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_512 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_512_avx2 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_512_aarch64 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_1024 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_1024_avx2 1
+#cmakedefine OQS_ENABLE_SIG_falcon_padded_1024_aarch64 1
#cmakedefine OQS_ENABLE_SIG_SPHINCS 1
#cmakedefine OQS_ENABLE_SIG_sphincs_sha2_128f_simple 1
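Downstream C code can gate on the new variants at compile time via the macros generated above; a small illustrative sketch, assuming the installed header path `oqs/oqsconfig.h` (macro names exactly as defined in this file):

    #include <oqs/oqsconfig.h>
    #include <stdio.h>

    int main(void) {
    #ifdef OQS_ENABLE_SIG_falcon_padded_512
        puts("Falcon-padded-512 compiled in");
    #endif
    #ifdef OQS_ENABLE_SIG_falcon_padded_1024_avx2
        puts("Falcon-padded-1024 AVX2 implementation compiled in");
    #endif
        return 0;
    }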
diff --git a/src/sig/falcon/CMakeLists.txt b/src/sig/falcon/CMakeLists.txt
index ff5a41b43..4be3ae829 100644
--- a/src/sig/falcon/CMakeLists.txt
+++ b/src/sig/falcon/CMakeLists.txt
@@ -51,4 +51,50 @@ if(OQS_ENABLE_SIG_falcon_1024_aarch64)
set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_1024_aarch64>)
endif()
+if(OQS_ENABLE_SIG_falcon_padded_512)
+ add_library(falcon_padded_512_clean OBJECT sig_falcon_padded_512.c pqclean_falcon-padded-512_clean/codec.c pqclean_falcon-padded-512_clean/common.c pqclean_falcon-padded-512_clean/fft.c pqclean_falcon-padded-512_clean/fpr.c pqclean_falcon-padded-512_clean/keygen.c pqclean_falcon-padded-512_clean/pqclean.c pqclean_falcon-padded-512_clean/rng.c pqclean_falcon-padded-512_clean/sign.c pqclean_falcon-padded-512_clean/vrfy.c)
+ target_include_directories(falcon_padded_512_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-512_clean)
+ target_include_directories(falcon_padded_512_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_512_clean>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+ add_library(falcon_padded_512_avx2 OBJECT pqclean_falcon-padded-512_avx2/codec.c pqclean_falcon-padded-512_avx2/common.c pqclean_falcon-padded-512_avx2/fft.c pqclean_falcon-padded-512_avx2/fpr.c pqclean_falcon-padded-512_avx2/keygen.c pqclean_falcon-padded-512_avx2/pqclean.c pqclean_falcon-padded-512_avx2/rng.c pqclean_falcon-padded-512_avx2/sign.c pqclean_falcon-padded-512_avx2/vrfy.c)
+ target_include_directories(falcon_padded_512_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-512_avx2)
+ target_include_directories(falcon_padded_512_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_512_avx2 PRIVATE -mavx2)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_512_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+ add_library(falcon_padded_512_aarch64 OBJECT pqclean_falcon-padded-512_aarch64/codec.c pqclean_falcon-padded-512_aarch64/common.c pqclean_falcon-padded-512_aarch64/fft.c pqclean_falcon-padded-512_aarch64/fft_tree.c pqclean_falcon-padded-512_aarch64/fpr.c pqclean_falcon-padded-512_aarch64/keygen.c pqclean_falcon-padded-512_aarch64/ntt.c pqclean_falcon-padded-512_aarch64/ntt_consts.c pqclean_falcon-padded-512_aarch64/poly_float.c pqclean_falcon-padded-512_aarch64/poly_int.c pqclean_falcon-padded-512_aarch64/pqclean.c pqclean_falcon-padded-512_aarch64/rng.c pqclean_falcon-padded-512_aarch64/sampler.c pqclean_falcon-padded-512_aarch64/sign.c pqclean_falcon-padded-512_aarch64/util.c pqclean_falcon-padded-512_aarch64/vrfy.c)
+ target_include_directories(falcon_padded_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-512_aarch64)
+ target_include_directories(falcon_padded_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_512_aarch64 PRIVATE)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_512_aarch64>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_1024)
+ add_library(falcon_padded_1024_clean OBJECT sig_falcon_padded_1024.c pqclean_falcon-padded-1024_clean/codec.c pqclean_falcon-padded-1024_clean/common.c pqclean_falcon-padded-1024_clean/fft.c pqclean_falcon-padded-1024_clean/fpr.c pqclean_falcon-padded-1024_clean/keygen.c pqclean_falcon-padded-1024_clean/pqclean.c pqclean_falcon-padded-1024_clean/rng.c pqclean_falcon-padded-1024_clean/sign.c pqclean_falcon-padded-1024_clean/vrfy.c)
+ target_include_directories(falcon_padded_1024_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-1024_clean)
+ target_include_directories(falcon_padded_1024_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_1024_clean>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+ add_library(falcon_padded_1024_avx2 OBJECT pqclean_falcon-padded-1024_avx2/codec.c pqclean_falcon-padded-1024_avx2/common.c pqclean_falcon-padded-1024_avx2/fft.c pqclean_falcon-padded-1024_avx2/fpr.c pqclean_falcon-padded-1024_avx2/keygen.c pqclean_falcon-padded-1024_avx2/pqclean.c pqclean_falcon-padded-1024_avx2/rng.c pqclean_falcon-padded-1024_avx2/sign.c pqclean_falcon-padded-1024_avx2/vrfy.c)
+ target_include_directories(falcon_padded_1024_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-1024_avx2)
+ target_include_directories(falcon_padded_1024_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_1024_avx2 PRIVATE -mavx2)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_1024_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+ add_library(falcon_padded_1024_aarch64 OBJECT pqclean_falcon-padded-1024_aarch64/codec.c pqclean_falcon-padded-1024_aarch64/common.c pqclean_falcon-padded-1024_aarch64/fft.c pqclean_falcon-padded-1024_aarch64/fft_tree.c pqclean_falcon-padded-1024_aarch64/fpr.c pqclean_falcon-padded-1024_aarch64/keygen.c pqclean_falcon-padded-1024_aarch64/ntt.c pqclean_falcon-padded-1024_aarch64/ntt_consts.c pqclean_falcon-padded-1024_aarch64/poly_float.c pqclean_falcon-padded-1024_aarch64/poly_int.c pqclean_falcon-padded-1024_aarch64/pqclean.c pqclean_falcon-padded-1024_aarch64/rng.c pqclean_falcon-padded-1024_aarch64/sampler.c pqclean_falcon-padded-1024_aarch64/sign.c pqclean_falcon-padded-1024_aarch64/util.c pqclean_falcon-padded-1024_aarch64/vrfy.c)
+ target_include_directories(falcon_padded_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_falcon-padded-1024_aarch64)
+ target_include_directories(falcon_padded_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(falcon_padded_1024_aarch64 PRIVATE)
+ set(_FALCON_OBJS ${_FALCON_OBJS} $<TARGET_OBJECTS:falcon_padded_1024_aarch64>)
+endif()
+
set(FALCON_OBJS ${_FALCON_OBJS} PARENT_SCOPE)
diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h b/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h
index cc2d49cf1..06787aaca 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_SECRETKEYBYTES 2305
#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_PUBLICKEYBYTES 1793
-#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES 1280
+#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES 1462
#define PQCLEAN_FALCON1024_AARCH64_CRYPTO_ALGNAME "Falcon-1024"
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES 1280 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
index dfd6d8aea..e90daf2b7 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
@@ -281,10 +281,10 @@ int PQCLEAN_FALCON1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int1
uint16x8_t neon_q; // 1
neon_127 = vdupq_n_s16(127);
neon__127 = vdupq_n_s16(-127);
+ neon_q = vdupq_n_u16(FALCON_Q);
neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
- neon_q = vdupq_n_u16(FALCON_Q);
e.val[1] = vdupq_n_u16(0);
for (int i = 0; i < FALCON_N; i += 64) {
diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c
index 1eea81fa8..7355b07db 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON1024_AARCH64_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON1024_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
- v = PQCLEAN_FALCON1024_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON1024_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCON1024_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -230,6 +225,7 @@ do_verify(
int16_t hm[FALCON_N];
int16_t sig[FALCON_N];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -242,6 +238,7 @@ do_verify(
!= PQCLEAN_FALCON1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
return -1;
}
+ // We move the conversion of `h` to the NTT domain into verify_raw()
/*
* Decode signature.
@@ -249,9 +246,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON1024_AARCH64_comp_decode(sig, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON1024_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +287,9 @@ int
PQCLEAN_FALCON1024_AARCH64_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
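The verification change repeated in each of the pqclean.c files in this patch can be summarized by the following standalone sketch (a hypothetical helper, not part of the patch): a buffer whose length equals the padded value length (PQCLEAN_FALCONPADDED*_CRYPTO_BYTES - NONCELEN - 1) is accepted when the compressed value decodes to fewer bytes and the remainder is all-zero padding; any other length mismatch is rejected.

    #include <stddef.h>
    #include <stdint.h>

    /* decoded_len is the return value of comp_decode(); 0 means decoding failed. */
    int padded_sig_len_ok(const uint8_t *sigbuf, size_t sigbuflen,
                          size_t decoded_len, size_t padded_len) {
        if (decoded_len == 0) {
            return 0;                    /* decoding error */
        }
        if (decoded_len == sigbuflen) {
            return 1;                    /* ordinary compressed signature */
        }
        if (sigbuflen != padded_len) {
            return 0;                    /* not the fixed padded length */
        }
        for (size_t i = decoded_len; i < sigbuflen; i++) {
            if (sigbuf[i] != 0) {
                return 0;                /* padding bytes must all be zero */
            }
        }
        return 1;
    }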
diff --git a/src/sig/falcon/pqclean_falcon-1024_avx2/api.h b/src/sig/falcon/pqclean_falcon-1024_avx2/api.h
index a0f6db1f4..85e201fc2 100644
--- a/src/sig/falcon/pqclean_falcon-1024_avx2/api.h
+++ b/src/sig/falcon/pqclean_falcon-1024_avx2/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_SECRETKEYBYTES 2305
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_PUBLICKEYBYTES 1793
-#define PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES 1280
+#define PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES 1462
#define PQCLEAN_FALCON1024_AVX2_CRYPTO_ALGNAME "Falcon-1024"
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES 1280 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c
index 27708cd68..ea214a19f 100644
--- a/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-1024_avx2/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON1024_AVX2_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
- v = PQCLEAN_FALCON1024_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCON1024_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[1024], hm[1024];
int16_t sig[1024];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON1024_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/api.h b/src/sig/falcon/pqclean_falcon-1024_clean/api.h
index 74fe34958..cc6557fde 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/api.h
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_SECRETKEYBYTES 2305
#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_PUBLICKEYBYTES 1793
-#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES 1280
+#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES 1462
#define PQCLEAN_FALCON1024_CLEAN_CRYPTO_ALGNAME "Falcon-1024"
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES 1280 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c
index 7ced3ff0b..086d249ef 100644
--- a/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-1024_clean/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON1024_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
- v = PQCLEAN_FALCON1024_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON1024_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCON1024_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[1024], hm[1024];
int16_t sig[1024];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON1024_CLEAN_comp_decode(sig, 10, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON1024_CLEAN_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON1024_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/api.h b/src/sig/falcon/pqclean_falcon-512_aarch64/api.h
index 996bf6185..d70db344b 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/api.h
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON512_AARCH64_CRYPTO_SECRETKEYBYTES 1281
#define PQCLEAN_FALCON512_AARCH64_CRYPTO_PUBLICKEYBYTES 897
-#define PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES 666
+#define PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES 752
#define PQCLEAN_FALCON512_AARCH64_CRYPTO_ALGNAME "Falcon-512"
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES 666 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h b/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h
index d1a49920b..c8f82991e 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/macrof.h
@@ -123,4 +123,3 @@
#define vfmla_lane(d, c, a, b, i) d = vfmaq_laneq_f64(c, a, b, i);
// d = c - a * b[i]
#define vfmls_lane(d, c, a, b, i) d = vfmsq_laneq_f64(c, a, b, i);
-
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h b/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h
index dcacf718d..3702fa1bd 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/poly.h
@@ -2,6 +2,7 @@
#define POLY_H
#include "inner.h"
+#include "params.h"
typedef enum ntt_domain {
NTT_NONE = 0,
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c
index 8adf73821..b898d746a 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -44,7 +44,7 @@ int
PQCLEAN_FALCON512_AARCH64_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk) {
union {
- uint8_t b[FALCON_KEYGEN_TEMP_9];
+ uint8_t b[28 * FALCON_N];
uint64_t dummy_u64;
fpr dummy_fpr;
} tmp;
@@ -115,10 +115,7 @@ PQCLEAN_FALCON512_AARCH64_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON512_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
- v = PQCLEAN_FALCON512_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON512_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCON512_AARCH64_comp_encode(sigbuf, *sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -230,6 +225,7 @@ do_verify(
int16_t hm[FALCON_N];
int16_t sig[FALCON_N];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -250,9 +246,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON512_AARCH64_comp_decode(sig, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON512_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -278,20 +287,9 @@ int
PQCLEAN_FALCON512_AARCH64_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-512_avx2/api.h b/src/sig/falcon/pqclean_falcon-512_avx2/api.h
index acae41ae3..2f74f2627 100644
--- a/src/sig/falcon/pqclean_falcon-512_avx2/api.h
+++ b/src/sig/falcon/pqclean_falcon-512_avx2/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON512_AVX2_CRYPTO_SECRETKEYBYTES 1281
#define PQCLEAN_FALCON512_AVX2_CRYPTO_PUBLICKEYBYTES 897
-#define PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES 666
+#define PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES 752
#define PQCLEAN_FALCON512_AVX2_CRYPTO_ALGNAME "Falcon-512"
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES 666 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c
index 143246ebe..84e393d69 100644
--- a/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-512_avx2/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON512_AVX2_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
- v = PQCLEAN_FALCON512_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCON512_AVX2_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[512], hm[512];
int16_t sig[512];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
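+ /*
+ * A value shorter than the buffer is accepted only when the buffer
+ * has the Falcon-padded length and all trailing bytes are zero
+ * padding; any other length mismatch is rejected.
+ */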
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON512_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/api.h b/src/sig/falcon/pqclean_falcon-512_clean/api.h
index 5c85f3834..49489d2b1 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/api.h
+++ b/src/sig/falcon/pqclean_falcon-512_clean/api.h
@@ -6,10 +6,12 @@
#define PQCLEAN_FALCON512_CLEAN_CRYPTO_SECRETKEYBYTES 1281
#define PQCLEAN_FALCON512_CLEAN_CRYPTO_PUBLICKEYBYTES 897
-#define PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES 666
+#define PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES 752
#define PQCLEAN_FALCON512_CLEAN_CRYPTO_ALGNAME "Falcon-512"
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES 666 // used in signature verification
+
/*
* Generate a new key pair. Public key goes into pk[], private key in sk[].
* Key sizes are exact (in bytes):
diff --git a/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c
index 979146a7d..80d8cbe32 100644
--- a/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c
+++ b/src/sig/falcon/pqclean_falcon-512_clean/pqclean.c
@@ -27,15 +27,15 @@
*
* signature:
* header byte: 0011nnnn
- * nonce 40 bytes
- * value (12 bits by element)
+ * nonce (r) 40 bytes
+ * value (s) compressed format
*
* message + signature:
* signature length (2 bytes, big-endian)
* nonce 40 bytes
* message
* header byte: 0010nnnn
- * value (12 bits by element)
+ * value compressed format
* (signature length is 1+len(value), not counting the nonce)
*/
@@ -115,10 +115,7 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair(
* receiving the actual value length.
*
* If a signature could be computed but not encoded because it would
- * exceed the output buffer size, then a new signature is computed. If
- * the provided buffer size is too low, this could loop indefinitely, so
- * the caller must provide a size that can accommodate signatures with a
- * large enough probability.
+ * exceed the output buffer size, then an error is returned.
*
* Return value: 0 on success, -1 on error.
*/
@@ -198,18 +195,16 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
inner_shake256_flip(&sc);
/*
- * Compute and return the signature. This loops until a signature
- * value is found that fits in the provided buffer.
+ * Compute and return the signature.
*/
- for (;;) {
- PQCLEAN_FALCON512_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
- v = PQCLEAN_FALCON512_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
- if (v != 0) {
- inner_shake256_ctx_release(&sc);
- *sigbuflen = v;
- return 0;
- }
+ PQCLEAN_FALCON512_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCON512_CLEAN_comp_encode(sigbuf, *sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ *sigbuflen = v;
+ return 0;
}
+ return -1;
}
/*
@@ -229,6 +224,7 @@ do_verify(
uint16_t h[512], hm[512];
int16_t sig[512];
inner_shake256_context sc;
+ size_t v;
/*
* Decode public key.
@@ -249,9 +245,22 @@ do_verify(
if (sigbuflen == 0) {
return -1;
}
- if (PQCLEAN_FALCON512_CLEAN_comp_decode(sig, 9, sigbuf, sigbuflen) != sigbuflen) {
+
+ v = PQCLEAN_FALCON512_CLEAN_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
return -1;
}
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
/*
* Hash nonce + message into a vector.
@@ -277,20 +286,9 @@ int
PQCLEAN_FALCON512_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
- /*
- * The PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES constant is used for
- * the signed message object (as produced by crypto_sign())
- * and includes a two-byte length value, so we take care here
- * to only generate signatures that are two bytes shorter than
- * the maximum. This is done to ensure that crypto_sign()
- * and crypto_sign_signature() produce the exact same signature
- * value, if used on the same message, with the same private key,
- * and using the same output from randombytes() (this is for
- * reproducibility of tests).
- */
size_t vlen;
- vlen = PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES - NONCELEN - 3;
+ vlen = PQCLEAN_FALCON512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
if (do_sign(sig + 1, sig + 1 + NONCELEN, &vlen, m, mlen, sk) < 0) {
return -1;
}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/LICENSE b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/LICENSE
new file mode 100644
index 000000000..4df2d7836
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/LICENSE
@@ -0,0 +1,57 @@
+This ARMv8 NEON implementation is provided under the Apache 2.0 license:
+
+/*
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+Based on the reference code provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/api.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/api.h
new file mode 100644
index 000000000..9b6299841
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED1024_AARCH64_API_H
+#define PQCLEAN_FALCONPADDED1024_AARCH64_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES 2305
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES 1793
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES 1280
+
+#define PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_ALGNAME "Falcon-padded-1024"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/codec.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/codec.c
new file mode 100644
index 000000000..05a8e49f3
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/codec.c
@@ -0,0 +1,554 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= FALCON_Q) {
+ return 0;
+ }
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_modq_decode(uint16_t *x, const void *in, size_t max_in_len, unsigned logn) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
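+ /*
+ * Sign-extend the bits-wide value: mask2 is its sign bit, and
+ * OR-ing with the two's-complement negation of (w & mask2)
+ * propagates that bit into the upper bits of w.
+ */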
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, uint8_t bits) {
+ size_t u, out_len;
+ int8_t minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ out_len = (size_t) ((FALCON_N * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ maxv = (int8_t) (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int8(x, minv, maxv)) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < FALCON_N; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(int8_t *x, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ in_len = ((FALCON_N * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < FALCON_N) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < FALCON_N) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x) {
+ uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int16(x, -2047, 2047)) {
+ return 0;
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * fits in the accumulator (a uint32_t).
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+
+ x[u] = (int16_t)(s ? -(int)m : (int)m);
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/common.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/common.c
new file mode 100644
index 000000000..883d89055
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/common.c
@@ -0,0 +1,549 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "macrous.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 5 * FALCON_Q) {
+ while (w >= FALCON_Q) {
+ w -= FALCON_Q;
+ }
+ *x++ = (uint16_t)w;
+ n--;
+ }
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce the values modulo q; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
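+ /*
+ * Branchless partial reduction modulo q = 12289: conditionally
+ * subtract 2*q (24578) twice, then q once; the final mask marks
+ * rejected samples (w >= 5*q = 61445) as 0xFFFF.
+ */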
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
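+ * Since p <= 287 < 512, ((j & p) + 0x1FF) >> 9 is 1 exactly
+ * when the 'p' bit of j is set.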
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h
+ * NEON provides the signed saturating doubling multiply-accumulate long
+ * instructions sqdmlal/sqdmlal2, so we maintain two parallel dependency
+ * chains rather than one for better scheduling.
+ * Each for loop is tuned for cache locality.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short(const int16_t *s1, const int16_t *s2) {
+ // Total SIMD register 18 = 16 + 2
+ int16x8x4_t neon_s1, neon_s2, neon_s3, neon_s4; // 16
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s1[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s1[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s1[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s1[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s2[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s2[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s2[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s2[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
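+ // Each accumulator holds twice the partial sum (sqdmlal doubles the
+ // products), so the halving add below recombines them into the
+ // plain squared norm.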
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const fpr *t0,
+ const fpr *t1) {
+ // Total SIMD registers: 26 = 16 + 8 + 2
+ int16x8x4_t neon_hm, neon_ts; // 8
+ float64x2x4_t neon_tf0, neon_tf1, neon_tf2, neon_tf3; // 16
+ int64x2x4_t neon_ts0, neon_ts1, neon_ts2, neon_ts3; // 16
+ int32x4x4_t neon_ts4, neon_ts5; // 8
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ // s1tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t0[i]);
+ vloadx4(neon_tf1, &t0[i + 8]);
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t0[i + 16]);
+ vloadx4(neon_tf3, &t0[i + 24]);
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ // hm = hm - fpr_rint(t0)
+ vload_s16_x4(neon_hm, &hm[i]);
+ neon_hm.val[0] = vsubq_s16(neon_hm.val[0], neon_ts.val[0]);
+ neon_hm.val[1] = vsubq_s16(neon_hm.val[1], neon_ts.val[1]);
+ neon_hm.val[2] = vsubq_s16(neon_hm.val[2], neon_ts.val[2]);
+ neon_hm.val[3] = vsubq_s16(neon_hm.val[3], neon_ts.val[3]);
+ vstore_s16_x4(&s1tmp[i], neon_hm);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[0]), vget_low_s16(neon_hm.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[1]), vget_low_s16(neon_hm.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[2]), vget_low_s16(neon_hm.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[3]), vget_low_s16(neon_hm.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[0], neon_hm.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[1], neon_hm.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[2], neon_hm.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[3], neon_hm.val[3]);
+ }
+
+ // s2tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t1[i]);
+ vloadx4(neon_tf1, &t1[i + 8]);
+
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t1[i + 16]);
+ vloadx4(neon_tf3, &t1[i + 24]);
+
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ neon_ts.val[0] = vnegq_s16(neon_ts.val[0]);
+ neon_ts.val[1] = vnegq_s16(neon_ts.val[1]);
+ neon_ts.val[2] = vnegq_s16(neon_ts.val[2]);
+ neon_ts.val[3] = vnegq_s16(neon_ts.val[3]);
+ vstore_s16_x4(&s2tmp[i], neon_ts);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[0]), vget_low_s16(neon_ts.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[1]), vget_low_s16(neon_ts.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[2]), vget_low_s16(neon_ts.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[3]), vget_low_s16(neon_ts.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[0], neon_ts.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[1], neon_ts.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[2], neon_ts.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[3], neon_ts.val[3]);
+ }
+
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
+
+int32_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_small_sqnorm(const int8_t *f) {
+ int8x16x4_t a;
+ int16x8x4_t b, c;
+ int32x4_t norm, norm_sh;
+
+ norm = vdupq_n_s32(0);
+ norm_sh = vdupq_n_s32(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&f[i]);
+
+ b.val[0] = vmovl_s8(vget_low_s8(a.val[0]));
+ b.val[1] = vmovl_high_s8(a.val[0]);
+ b.val[2] = vmovl_s8(vget_low_s8(a.val[1]));
+ b.val[3] = vmovl_high_s8(a.val[1]);
+
+ c.val[0] = vmovl_s8(vget_low_s8(a.val[2]));
+ c.val[1] = vmovl_high_s8(a.val[2]);
+ c.val[2] = vmovl_s8(vget_low_s8(a.val[3]));
+ c.val[3] = vmovl_high_s8(a.val[3]);
+
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[0]), vget_low_s16(b.val[0]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[1]), vget_low_s16(b.val[1]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[2]), vget_low_s16(b.val[2]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[3]), vget_low_s16(b.val[3]));
+
+ norm = vqdmlal_high_s16(norm, b.val[0], b.val[0]);
+ norm = vqdmlal_high_s16(norm, b.val[1], b.val[1]);
+ norm = vqdmlal_high_s16(norm, b.val[2], b.val[2]);
+ norm = vqdmlal_high_s16(norm, b.val[3], b.val[3]);
+
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[0]), vget_low_s16(c.val[0]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[1]), vget_low_s16(c.val[1]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[2]), vget_low_s16(c.val[2]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[3]), vget_low_s16(c.val[3]));
+
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[0], c.val[0]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[1], c.val[1]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[2], c.val[2]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[3], c.val[3]);
+ }
+ // 32x4
+ norm = vhaddq_s32(norm, norm_sh);
+ // 32x4 -> 32x2
+ int32x2_t tmp;
+ tmp = vqadd_s32(vget_low_s32(norm), vget_high_s32(norm));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ int32_t s;
+ s = vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft.c
new file mode 100644
index 000000000..652a306b0
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft.c
@@ -0,0 +1,1038 @@
+/*
+ * High-speed vectorized FFT code for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Forward FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, v_re, v_im, t_re, t_im, s;
+
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0];
+
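+ /*
+ * The level-2 twiddle factor has equal real and imaginary parts,
+ * so multiplying (y_re + i*y_im) by it needs only two real
+ * multiplications followed by the add/sub below.
+ */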
+ t_re = y_re * s;
+ t_im = y_im * s;
+
+ v_re = t_re - t_im;
+ v_im = t_re + t_im;
+
+ f[0] = x_re + v_re;
+ f[1] = x_re - v_re;
+ f[2] = x_im + v_im;
+ f[3] = x_im - v_im;
+}
+
+/*
+ * Vectorized 2 layers of Forward FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log3(fpr *f) {
+ // Total SIMD registers: 18 = 4 + 6 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t s_re_im, x, y; // 6
+ float64x2_t v_re, v_im, x_re, x_im, y_re, y_im, t_x, t_y; // 8
+
+ vloadx4(tmp, &f[0]);
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmul(v_re, tmp.val[1], s_re_im.val[0]);
+ vfmul(v_im, tmp.val[3], s_re_im.val[0]);
+
+ vfsub(t_x, v_re, v_im);
+ vfadd(t_y, v_re, v_im);
+
+ vfsub(tmp.val[1], tmp.val[0], t_x);
+ vfsub(tmp.val[3], tmp.val[2], t_y);
+
+ vfadd(tmp.val[0], tmp.val[0], t_x);
+ vfadd(tmp.val[2], tmp.val[2], t_y);
+
+ x_re = vtrn1q_f64(tmp.val[0], tmp.val[1]);
+ y_re = vtrn2q_f64(tmp.val[0], tmp.val[1]);
+ x_im = vtrn1q_f64(tmp.val[2], tmp.val[3]);
+ y_im = vtrn2q_f64(tmp.val[2], tmp.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(v_re, v_im, y_re, y_im, s_re_im.val[0], s_re_im.val[1]);
+
+ FPC_ADD(x.val[0], y.val[0], x_re, x_im, v_re, v_im);
+ FPC_SUB(x.val[1], y.val[1], x_re, x_im, v_re, v_im);
+
+ vstore2(&f[0], x);
+ vstore2(&f[4], y);
+}
+
+/*
+ * Vectorized 3 layers of Forward FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log4(fpr *f) {
+ // Total SIMD register: 26 = 8 + 18
+ float64x2x4_t t0, t1; // 8
+ float64x2x2_t x_re, x_im, y_re, y_im, v1, v2, tx, ty, s_re_im; // 18
+
+ vloadx4(t0, &f[0]);
+ vloadx4(t1, &f[8]);
+ vload(s_re_im.val[0], &fpr_tab_log2[0]);
+
+ vfmul(v1.val[0], t0.val[2], s_re_im.val[0]);
+ vfmul(v1.val[1], t0.val[3], s_re_im.val[0]);
+
+ vfmul(v2.val[0], t1.val[2], s_re_im.val[0]);
+ vfmul(v2.val[1], t1.val[3], s_re_im.val[0]);
+
+ vfsub(tx.val[0], v1.val[0], v2.val[0]);
+ vfsub(tx.val[1], v1.val[1], v2.val[1]);
+
+ vfadd(ty.val[0], v1.val[0], v2.val[0]);
+ vfadd(ty.val[1], v1.val[1], v2.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[2], t1.val[2], tx.val[0], ty.val[0]);
+ FWD_BOT(t0.val[1], t1.val[1], t0.val[3], t1.val[3], tx.val[1], ty.val[1]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ FWD_TOP_LANE(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ x_re.val[0] = t0.val[0];
+ x_re.val[1] = t0.val[2];
+ y_re.val[0] = t0.val[1];
+ y_re.val[1] = t0.val[3];
+
+ x_im.val[0] = t1.val[0];
+ x_im.val[1] = t1.val[2];
+ y_im.val[0] = t1.val[1];
+ y_im.val[1] = t1.val[3];
+
+ t0.val[0] = vzip1q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[1] = vzip2q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[2] = vzip1q_f64(y_re.val[0], y_re.val[1]);
+ t0.val[3] = vzip2q_f64(y_re.val[0], y_re.val[1]);
+
+ t1.val[0] = vzip1q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[1] = vzip2q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[2] = vzip1q_f64(y_im.val[0], y_im.val[1]);
+ t1.val[3] = vzip2q_f64(y_im.val[0], y_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ FWD_TOP(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ vstore4(&f[0], t0);
+ vstore4(&f[8], t1);
+}
+
+/*
+ * Vectorized 4 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(fpr *f, const unsigned logn) {
+ // Total SIMD register: 34 = 2 + 32
+ float64x2x2_t s_re_im; // 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im, v_re, v_im; // 32
+
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ unsigned int level = logn - 3;
+ const fpr *fpr_tab2 = fpr_table[level++],
+ *fpr_tab3 = fpr_table[level++],
+ *fpr_tab4 = fpr_table[level++],
+ *fpr_tab5 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+
+ /*
+ * We only increase k2 when j has the form j = 32*x + 16,
+ * i.e. when (j % 32) == 16.
+ */
+ k2 += 2 * ((j & 31) == 16);
+
+ vloadx4(y_re, &f[j + 8]);
+ vloadx4(y_im, &f[j + 8 + hn]);
+
+ if (logn == 5) {
+ // Handle the special case of fpr_tab_log2, where re == im.
+ // This reduces the number of multiplications, although it uses
+ // the same number of instructions as the "else" branch.
+ vfmulx4_i(t_im, y_im, s_re_im.val[0]);
+ vfmulx4_i(t_re, y_re, s_re_im.val[0]);
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+ } else {
+ FWD_TOP_LANEx4(v_re, v_im, y_re, y_im, s_re_im.val[0]);
+ }
+
+ vloadx4(x_re, &f[j]);
+ vloadx4(x_im, &f[j + hn]);
+
+ if ((j >> 4) & 1) {
+ FWD_BOTJx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ } else {
+ FWD_BOTx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[2], x_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[2], y_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[1]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ transpose_f64(x_re, x_re, v_re, 0, 2, 0);
+ transpose_f64(x_re, x_re, v_re, 1, 3, 1);
+ transpose_f64(x_im, x_im, v_im, 0, 2, 0);
+ transpose_f64(x_im, x_im, v_im, 1, 3, 1);
+
+ v_re.val[0] = x_re.val[2];
+ x_re.val[2] = x_re.val[1];
+ x_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = x_im.val[2];
+ x_im.val[2] = x_im.val[1];
+ x_im.val[1] = v_im.val[0];
+
+ transpose_f64(y_re, y_re, v_re, 0, 2, 2);
+ transpose_f64(y_re, y_re, v_re, 1, 3, 3);
+ transpose_f64(y_im, y_im, v_im, 0, 2, 2);
+ transpose_f64(y_im, y_im, v_im, 1, 3, 3);
+
+ v_re.val[0] = y_re.val[2];
+ y_re.val[2] = y_re.val[1];
+ y_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = y_im.val[2];
+ y_im.val[2] = y_im.val[1];
+ y_im.val[1] = v_im.val[0];
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j], x_re);
+ vstore4(&f[j + hn], x_im);
+
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vstore4(&f[j + 8], y_re);
+ vstore4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn1(fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ // Total SIMD register: 25 = 1 + 24
+ float64x2_t s_re_im; // 1
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im, v_re, v_im; // 24
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(b_re, &f[j + ht]);
+ vfmulx4_i(t_re, b_re, s_re_im);
+
+ vloadx4(b_im, &f[j + ht + hn]);
+ vfmulx4_i(t_im, b_im, s_re_im);
+
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+
+ FWD_BOTx4(a_re, a_im, b_re, b_im, v_re, v_im);
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j], a_re);
+
+ vstorex4(&f[j + ht + hn], b_im);
+ vstorex4(&f[j + hn], a_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn2(fpr *f, const unsigned logn, const unsigned level) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 8 + 16 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_tab1 = NULL, *fpr_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - level + 2;
+
+ for (l = level - 1; l > 4; l -= 2) {
+ len = 1 << (l - 2);
+ fpr_tab1 = fpr_table[bar++];
+ fpr_tab2 = fpr_table[bar++];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << l) {
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOT(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOT(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+
+ start += 1U << l;
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOTJ(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+ }
+ }
+}
+
+/*
+ * 1 layer of Inverse FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, s;
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f[0] = (x_re + y_re) * 0.5;
+ f[2] = (x_im + y_im) * 0.5;
+
+ x_re = (x_re - y_re) * s;
+ x_im = (x_im - y_im) * s;
+
+ f[1] = x_im + x_re;
+ f[3] = x_im - x_re;
+}
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log3(fpr *f) {
+ // Total SIMD registers: 12 = 4 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t x_re_im, y_re_im, v, s_re_im; // 8
+
+ vload2(x_re_im, &f[0]);
+ vload2(y_re_im, &f[4]);
+
+ vfsub(v.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfsub(v.val[1], y_re_im.val[0], y_re_im.val[1]);
+ vfadd(x_re_im.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfadd(x_re_im.val[1], y_re_im.val[0], y_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmul(y_re_im.val[0], v.val[1], s_re_im.val[1]);
+ vfmla(y_re_im.val[0], y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+ vfmls(y_re_im.val[1], y_re_im.val[1], v.val[0], s_re_im.val[1]);
+
+ tmp.val[0] = vtrn1q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[1] = vtrn2q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[2] = vtrn1q_f64(x_re_im.val[1], y_re_im.val[1]);
+ tmp.val[3] = vtrn2q_f64(x_re_im.val[1], y_re_im.val[1]);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfadd(x_re_im.val[0], tmp.val[0], tmp.val[1]);
+ vfadd(x_re_im.val[1], tmp.val[2], tmp.val[3]);
+ vfsub(v.val[0], tmp.val[0], tmp.val[1]);
+ vfsub(v.val[1], tmp.val[2], tmp.val[3]);
+
+ vfmuln(tmp.val[0], x_re_im.val[0], 0.25);
+ vfmuln(tmp.val[2], x_re_im.val[1], 0.25);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.25);
+
+ vfmul(y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+
+ vfadd(tmp.val[1], y_re_im.val[1], y_re_im.val[0]);
+ vfsub(tmp.val[3], y_re_im.val[1], y_re_im.val[0]);
+
+ vstorex4(&f[0], tmp);
+}
+
+/*
+ * Vectorized 3 layers of Inverse FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log4(fpr *f) {
+ // Total SIMD registers: 18 = 12 + 6
+ float64x2x4_t re, im, t; // 12
+ float64x2x2_t t_re, t_im, s_re_im; // 6
+
+ vload4(re, &f[0]);
+ vload4(im, &f[8]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ INV_BOTJ(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ // re: 0, 4 | 1, 5 | 2, 6 | 3, 7
+ // im: 8, 12| 9, 13|10, 14|11, 15
+ transpose_f64(re, re, t, 0, 1, 0);
+ transpose_f64(re, re, t, 2, 3, 1);
+ transpose_f64(im, im, t, 0, 1, 2);
+ transpose_f64(im, im, t, 2, 3, 3);
+
+ // re: 0, 1 | 4, 5 | 2, 3 | 6, 7
+ // im: 8, 9 | 12, 13|10, 11| 14, 15
+ t.val[0] = re.val[1];
+ re.val[1] = re.val[2];
+ re.val[2] = t.val[0];
+
+ t.val[1] = im.val[1];
+ im.val[1] = im.val[2];
+ im.val[2] = t.val[1];
+
+ // re: 0, 1 | 2, 3| 4, 5 | 6, 7
+ // im: 8, 9 | 10, 11| 12, 13| 14, 15
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ INV_BOTJ_LANE(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[2], im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], re.val[1], im.val[1], re.val[3], im.val[3]);
+
+ vfmuln(re.val[0], re.val[0], 0.12500000000);
+ vfmuln(re.val[1], re.val[1], 0.12500000000);
+ vfmuln(im.val[0], im.val[0], 0.12500000000);
+ vfmuln(im.val[1], im.val[1], 0.12500000000);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.12500000000);
+
+ vfmul(t_re.val[0], t_re.val[0], s_re_im.val[0]);
+ vfmul(t_re.val[1], t_re.val[1], s_re_im.val[0]);
+ vfmul(t_im.val[0], t_im.val[0], s_re_im.val[0]);
+ vfmul(t_im.val[1], t_im.val[1], s_re_im.val[0]);
+
+ vfsub(im.val[2], t_im.val[0], t_re.val[0]);
+ vfsub(im.val[3], t_im.val[1], t_re.val[1]);
+ vfadd(re.val[2], t_im.val[0], t_re.val[0]);
+ vfadd(re.val[3], t_im.val[1], t_re.val[1]);
+
+ vstorex4(&f[0], re);
+ vstorex4(&f[8], im);
+}
+
+/*
+ * Vectorized 4 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register: 26 = 24 + 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im; // 24
+ float64x2x2_t s_re_im; // 2
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+
+ unsigned int level = logn;
+ const fpr *fpr_tab5 = fpr_table[level--],
+ *fpr_tab4 = fpr_table[level--],
+ *fpr_tab3 = fpr_table[level--],
+ *fpr_tab2 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+
+ vload4(x_re, &f[j]);
+ vload4(x_im, &f[j + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[2], t_im.val[2], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ vload4(y_re, &f[j + 8]);
+ vload4(y_im, &f[j + 8 + hn]);
+
+ INV_TOPJ(t_re.val[1], t_im.val[1], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(x_re.val[3], x_im.val[3], t_re.val[2], t_im.val[2], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(y_re.val[1], y_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ transpose_f64(x_re, x_re, t_re, 0, 1, 0);
+ transpose_f64(x_re, x_re, t_re, 2, 3, 1);
+ transpose_f64(y_re, y_re, t_re, 0, 1, 2);
+ transpose_f64(y_re, y_re, t_re, 2, 3, 3);
+
+ transpose_f64(x_im, x_im, t_im, 0, 1, 0);
+ transpose_f64(x_im, x_im, t_im, 2, 3, 1);
+ transpose_f64(y_im, y_im, t_im, 0, 1, 2);
+ transpose_f64(y_im, y_im, t_im, 2, 3, 3);
+
+ t_re.val[0] = x_re.val[1];
+ x_re.val[1] = x_re.val[2];
+ x_re.val[2] = t_re.val[0];
+
+ t_re.val[1] = y_re.val[1];
+ y_re.val[1] = y_re.val[2];
+ y_re.val[2] = t_re.val[1];
+
+ t_im.val[0] = x_im.val[1];
+ x_im.val[1] = x_im.val[2];
+ x_im.val[2] = t_im.val[0];
+
+ t_im.val[1] = y_im.val[1];
+ y_im.val[1] = y_im.val[2];
+ y_im.val[2] = t_im.val[1];
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ INV_BOTJ_LANE(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJ_LANE(y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2], s_re_im.val[1]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[1]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ INV_BOTJ_LANE(x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJ_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJm_LANE(y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2], s_re_im.val[0]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0]);
+
+ if ((j >> 4) & 1) {
+ INV_TOPJmx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ } else {
+ INV_TOPJx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+ k2 += 2 * ((j & 31) == 16);
+
+ if (last) {
+ vfmuln(s_re_im.val[0], s_re_im.val[0], fpr_p2_tab[logn]);
+ vfmulnx4(x_re, x_re, fpr_p2_tab[logn]);
+ vfmulnx4(x_im, x_im, fpr_p2_tab[logn]);
+ }
+ vstorex4(&f[j], x_re);
+ vstorex4(&f[j + hn], x_im);
+
+ if (logn == 5) {
+ // Special case in fpr_tab_log2 where re == im
+ vfmulx4_i(t_re, t_re, s_re_im.val[0]);
+ vfmulx4_i(t_im, t_im, s_re_im.val[0]);
+
+ vfaddx4(y_re, t_im, t_re);
+ vfsubx4(y_im, t_im, t_re);
+ } else {
+ if ((j >> 4) & 1) {
+ INV_BOTJm_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ } else {
+ INV_BOTJ_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ }
+ }
+
+ vstorex4(&f[j + 8], y_re);
+ vstorex4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn1(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register 26 = 24 + 2
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im; // 24
+ float64x2_t s_re_im; // 2
+
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+ vloadx4(b_re, &f[j + ht]);
+ vloadx4(b_im, &f[j + ht + hn]);
+
+ INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im);
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ if (last) {
+ vfmuln(s_re_im, s_re_im, fpr_p2_tab[logn]);
+ vfmulnx4(a_re, a_re, fpr_p2_tab[logn]);
+ vfmulnx4(a_im, a_im, fpr_p2_tab[logn]);
+ }
+
+ vstorex4(&f[j], a_re);
+ vstorex4(&f[j + hn], a_im);
+
+ vfmulx4_i(t_re, t_re, s_re_im);
+ vfmulx4_i(t_im, t_im, s_re_im);
+
+ vfaddx4(b_re, t_im, t_re);
+ vfsubx4(b_im, t_im, t_re);
+
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j + ht + hn], b_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn2(fpr *f, const unsigned logn, const unsigned level, unsigned last) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 16 + 8 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_inv_tab1 = NULL, *fpr_inv_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - 4;
+
+ for (l = 4; l < logn - level - 1; l += 2) {
+ len = 1 << l;
+ last -= 1;
+ fpr_inv_tab1 = fpr_table[bar--];
+ fpr_inv_tab2 = fpr_table[bar--];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << (l + 2)) {
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJ(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJ_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+
+ start += 1U << (l + 2);
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJm(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJm_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJm_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+ }
+ }
+}
+
+/*
+ * Scalable vectorized Forward FFT implementation.
+ * Supports logn in [1, 10]; can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_FFT(fpr *f, const unsigned logn) {
+ unsigned level = logn;
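+ // Decomposition: the outer (long-stride) layers are handled by FFT_logn1
+ // (one layer) and/or FFT_logn2 (two layers per pass); FFT_log5 then
+ // finishes the remaining layers within blocks of 16 complex points.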
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, 5);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn2(f, logn, level);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_logn2(f, logn, level - 1);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT_log5(f, logn);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Scalable vectorized Inverse FFT implementation.
+ * Supports logn in [1, 10]; can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(fpr *f, const unsigned logn) {
+ const unsigned level = (logn - 5) & 1;
+
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, 5, 1);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn2(f, logn, level, 1);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn2(f, logn, level, 0);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ default:
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft_tree.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft_tree.c
new file mode 100644
index 000000000..6e5432e25
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fft_tree.c
@@ -0,0 +1,247 @@
+/*
+ * High-speed vectorized FFT tree for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Merge FFT for 2 complex points (4 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log2(fpr *f, const fpr *f0, const fpr *f1) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f0[0];
+ a_im = f0[1];
+ s = fpr_tab_log2[0];
+ b_re = f1[0] * s;
+ b_im = f1[1] * s;
+
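+ // d = f1 * (1 + i)/sqrt(2), i.e. f1 rotated by the twiddle factor e^(i*pi/4)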
+ d_re = b_re - b_im;
+ d_im = b_re + b_im;
+
+ f[0] = a_re + d_re;
+ f[2] = a_im + d_im;
+ f[1] = a_re - d_re;
+ f[3] = a_im - d_im;
+}
+
+/*
+ * Vectorized 1 layer of Merge FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log3(fpr *f, const fpr *f0, const fpr *f1) {
+ // Total SIMD registers: 12 = 10 + 2
+ float64x2x2_t g1, g0, g_re, g_im, s_re_im; // 10
+ float64x2_t t_re, t_im; // 2
+
+ vloadx2(g1, &f1[0]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(t_re, t_im, g1.val[0], g1.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vloadx2(g0, &f0[0]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0.val[0], g0.val[1], t_re, t_im);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0.val[0], g0.val[1], t_re, t_im);
+
+ vstore2(&f[0], g_re);
+ vstore2(&f[4], g_im);
+}
+
+/*
+ * Vectorized 1 layer of Merge FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log4(fpr *f, const fpr *f0, const fpr *f1, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_merge = fpr_table[logn];
+
+ // Total SIMD register 22 = 14 + 8
+ float64x2x2_t g1_re, g1_im, g0_re, g0_im, s_re_im, t_re, t_im; // 14
+ float64x2x4_t g_re, g_im; // 8
+
+ for (unsigned j = 0; j < ht; j += 4) {
+ vload2(g1_re, &f1[j]);
+ vload2(g1_im, &f1[j + ht]);
+
+ vload2(s_re_im, &fpr_merge[j]);
+
+ FWD_TOP(t_re.val[0], t_im.val[0], g1_re.val[0], g1_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_re, &f0[j]);
+
+ FWD_TOP(t_re.val[1], t_im.val[1], g1_re.val[1], g1_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_im, &f0[j + ht]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_ADDJ(g_re.val[2], g_im.val[2], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+ FPC_SUBJ(g_re.val[3], g_im.val[3], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j << 1], g_re);
+ vstore4(&f[(j + ht) << 1], g_im);
+ }
+}
+
+/*
+ * 1 layer of Split FFT for 2 complex points (4 coefficients).
+ */
+static void
+PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log2(fpr *restrict f0, fpr *restrict f1, const fpr *restrict f) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f[0];
+ b_re = f[1];
+ a_im = f[2];
+ b_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f0[0] = (a_re + b_re) * 0.5;
+ f0[1] = (a_im + b_im) * 0.5;
+
+ d_re = (a_re - b_re) * s;
+ d_im = (a_im - b_im) * s;
+
+ f1[0] = d_im + d_re;
+ f1[1] = d_im - d_re;
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log3(fpr *f0, fpr *f1, const fpr *f) {
+ // Total SIMD registers: 12
+ float64x2x2_t re, im, g0, g1, s_re_im, tm; // 12
+
+ vload2(re, &f[0]);
+ vload2(im, &f[4]);
+
+ FPC_ADD(g0.val[0], g0.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ FPC_SUB(tm.val[0], tm.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmuln(g0.val[0], g0.val[0], 0.5);
+ vfmuln(g0.val[1], g0.val[1], 0.5);
+ vstorex2(&f0[0], g0);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.5);
+ vfmuln(s_re_im.val[1], s_re_im.val[1], 0.5);
+
+ INV_BOTJ(g1.val[0], g1.val[1], tm.val[0], tm.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstorex2(&f1[0], g1);
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log4(fpr *f0, fpr *f1, const fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_split = fpr_table[logn];
+
+ // Total SIMD register 23 = 1 + 8 + 14
+ float64x2_t half; // 1
+ float64x2x4_t g_re, g_im; // 8
+ float64x2x2_t s_re_im, t_re, t_im, g1_re, g1_im, g0_re, g0_im; // 14
+
+ half = vdupq_n_f64(0.5);
+ for (unsigned j = 0; j < ht; j += 4) {
+ unsigned j2 = j << 1;
+ vload4(g_re, &f[j2]);
+ vload4(g_im, &f[j2 + hn]);
+
+ FPC_ADD(g0_re.val[0], g0_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_ADD(g0_re.val[1], g0_im.val[1], g_re.val[2], g_im.val[2], g_re.val[3], g_im.val[3]);
+
+ FPC_SUB(t_re.val[0], t_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_SUB(t_re.val[1], t_im.val[1], g_re.val[3], g_im.val[3], g_re.val[2], g_im.val[2]);
+
+ vload2(s_re_im, &fpr_split[j]);
+
+ vfmul(g0_re.val[0], g0_re.val[0], half);
+ vfmul(g0_re.val[1], g0_re.val[1], half);
+ vstore2(&f0[j], g0_re);
+
+ vfmul(g0_im.val[0], g0_im.val[0], half);
+ vfmul(g0_im.val[1], g0_im.val[1], half);
+ vstore2(&f0[j + ht], g0_im);
+
+ vfmul(s_re_im.val[0], s_re_im.val[0], half);
+ vfmul(s_re_im.val[1], s_re_im.val[1], half);
+
+ INV_BOTJ(g1_re.val[0], g1_im.val[0], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(g1_re.val[1], g1_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstore2(&f1[j], g1_re);
+ vstore2(&f1[j + ht], g1_im);
+ }
+}
+
+/*
+ * Vectorized Split FFT implementation
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1, const fpr *f, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; qn = 0;
+ f0[0] = f[0];
+ f1[0] = f[1];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log2(f0, f1, f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log3(f0, f1, f);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log4(f0, f1, f, logn);
+ break;
+ }
+}
+
+/*
+ * Vectorized Merge FFT implementation
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(fpr *restrict f, const fpr *restrict f0,
+ const fpr *restrict f1, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ f[0] = f0[0];
+ f[1] = f1[0];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log2(f, f0, f1);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log3(f, f0, f1);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log4(f, f0, f1, logn);
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.c
new file mode 100644
index 000000000..3270c0d38
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.c
@@ -0,0 +1,336 @@
+/*
+ * Compressed floating-point Twiddle Factor.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+
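+/*
+ * fpr_p2_tab[k] = 2^(1-k). The inverse FFT multiplies its output by
+ * fpr_p2_tab[logn] in its final pass.
+ */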
+const fpr fpr_p2_tab[] = {
+ 2.00000000000,
+ 1.00000000000,
+ 0.50000000000,
+ 0.25000000000,
+ 0.12500000000,
+ 0.06250000000,
+ 0.03125000000,
+ 0.01562500000,
+ 0.00781250000,
+ 0.00390625000,
+ 0.00195312500
+};
+
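+/*
+ * Each fpr_tab_logN table holds the twiddle factors consumed at FFT level N,
+ * stored as interleaved (real, imaginary) = (cos, sin) pairs. For example,
+ * fpr_tab_log2[0..1] is (cos(pi/4), sin(pi/4)).
+ */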
+const fpr fpr_tab_log2[] = {
+ 0.707106781186547524400844362, 0.707106781186547524400844362, // 4, 5
+};
+
+const fpr fpr_tab_log3[] = {
+ 0.923879532511286756128183189, 0.382683432365089771728459984, // 8, 9
+ -0.382683432365089771728459984, 0.923879532511286756128183189,
+};
+
+const fpr fpr_tab_log4[] = {
+ 0.980785280403230449126182236, 0.195090322016128267848284868, // 16
+ 0.555570233019602224742830814, 0.831469612302545237078788378, // 20
+};
+
+const fpr fpr_tab_log5[] = {
+ 0.995184726672196886244836953, 0.098017140329560601994195564, // 32
+ 0.634393284163645498215171613, 0.773010453362736960810906610, // 36
+ 0.881921264348355029712756864, 0.471396736825997648556387626, // 40
+ 0.290284677254462367636192376, 0.956940335732208864935797887, // 44
+};
+
+const fpr fpr_tab_log6[] = {
+ 0.998795456205172392714771605, 0.049067674327418014254954977, // 64
+ 0.671558954847018400625376850, 0.740951125354959091175616897, // 68
+ 0.903989293123443331586200297, 0.427555093430282094320966857, // 72
+ 0.336889853392220050689253213, 0.941544065183020778412509403, // 76
+ 0.970031253194543992603984207, 0.242980179903263889948274162, // 80
+ 0.514102744193221726593693839, 0.857728610000272069902269984, // 84
+ 0.803207531480644909806676513, 0.595699304492433343467036529, // 88
+ 0.146730474455361751658850130, 0.989176509964780973451673738, // 92
+};
+
+const fpr fpr_tab_log7[] = {
+ 0.999698818696204220115765650, 0.024541228522912288031734529, // 128
+ 0.689540544737066924616730630, 0.724247082951466920941069243, // 132
+ 0.914209755703530654635014829, 0.405241314004989870908481306, // 136
+ 0.359895036534988148775104572, 0.932992798834738887711660256, // 140
+ 0.975702130038528544460395766, 0.219101240156869797227737547, // 144
+ 0.534997619887097210663076905, 0.844853565249707073259571205, // 148
+ 0.817584813151583696504920884, 0.575808191417845300745972454, // 152
+ 0.170961888760301226363642357, 0.985277642388941244774018433, // 156
+ 0.992479534598709998156767252, 0.122410675199216198498704474, // 160
+ 0.615231590580626845484913563, 0.788346427626606262009164705, // 164
+ 0.870086991108711418652292404, 0.492898192229784036873026689, // 168
+ 0.266712757474898386325286515, 0.963776065795439866686464356, // 172
+ 0.949528180593036667195936074, 0.313681740398891476656478846, // 176
+ 0.449611329654606600046294579, 0.893224301195515320342416447, // 180
+ 0.757208846506484547575464054, 0.653172842953776764084203014, // 184
+ 0.073564563599667423529465622, 0.997290456678690216135597140, // 188
+};
+
+const fpr fpr_tab_log8[] = {
+ 0.999924701839144540921646491, 0.012271538285719926079408262, // 256
+ 0.698376249408972853554813503, 0.715730825283818654125532623, // 260
+ 0.919113851690057743908477789, 0.393992040061048108596188661, // 264
+ 0.371317193951837543411934967, 0.928506080473215565937167396, // 268
+ 0.978317370719627633106240097, 0.207111376192218549708116020, // 272
+ 0.545324988422046422313987347, 0.838224705554838043186996856, // 276
+ 0.824589302785025264474803737, 0.565731810783613197389765011, // 280
+ 0.183039887955140958516532578, 0.983105487431216327180301155, // 284
+ 0.993906970002356041546922813, 0.110222207293883058807899140, // 288
+ 0.624859488142386377084072816, 0.780737228572094478301588484, // 292
+ 0.876070094195406607095844268, 0.482183772079122748517344481, // 296
+ 0.278519689385053105207848526, 0.960430519415565811199035138, // 300
+ 0.953306040354193836916740383, 0.302005949319228067003463232, // 304
+ 0.460538710958240023633181487, 0.887639620402853947760181617, // 308
+ 0.765167265622458925888815999, 0.643831542889791465068086063, // 312
+ 0.085797312344439890461556332, 0.996312612182778012627226190, // 316
+ 0.998118112900149207125155861, 0.061320736302208577782614593, // 320
+ 0.662415777590171761113069817, 0.749136394523459325469203257, // 324
+ 0.898674465693953843041976744, 0.438616238538527637647025738, // 328
+ 0.325310292162262934135954708, 0.945607325380521325730945387, // 332
+ 0.966976471044852109087220226, 0.254865659604514571553980779, // 336
+ 0.503538383725717558691867071, 0.863972856121586737918147054, // 340
+ 0.795836904608883536262791915, 0.605511041404325513920626941, // 344
+ 0.134580708507126186316358409, 0.990902635427780025108237011, // 348
+ 0.987301418157858382399815802, 0.158858143333861441684385360, // 352
+ 0.585797857456438860328080838, 0.810457198252594791726703434, // 356
+ 0.851355193105265142261290312, 0.524589682678468906215098464, // 360
+ 0.231058108280671119643236018, 0.972939952205560145467720114, // 364
+ 0.937339011912574923201899593, 0.348418680249434568419308588, // 368
+ 0.416429560097637182562598911, 0.909167983090522376563884788, // 372
+ 0.732654271672412834615546649, 0.680600997795453050594430464, // 376
+ 0.036807222941358832324332691, 0.999322384588349500896221011, // 380
+};
+
+const fpr fpr_tab_log9[] = {
+ 0.999981175282601142656990438, 0.006135884649154475359640235, // 512
+ 0.702754744457225302452914421, 0.711432195745216441522130290, // 516
+ 0.921514039342041943465396332, 0.388345046698826291624993541, // 520
+ 0.377007410216418256726567823, 0.926210242138311341974793388, // 524
+ 0.979569765685440534439326110, 0.201104634842091911558443546, // 528
+ 0.550457972936604802977289893, 0.834862874986380056304401383, // 532
+ 0.828045045257755752067527592, 0.560661576197336023839710223, // 536
+ 0.189068664149806212754997837, 0.981963869109555264072848154, // 540
+ 0.994564570734255452119106243, 0.104121633872054579120943880, // 544
+ 0.629638238914927025372981341, 0.776888465673232450040827983, // 548
+ 0.879012226428633477831323711, 0.476799230063322133342158117, // 552
+ 0.284407537211271843618310615, 0.958703474895871555374645792, // 556
+ 0.955141168305770721498157712, 0.296150888243623824121786128, // 560
+ 0.465976495767966177902756065, 0.884797098430937780104007041, // 564
+ 0.769103337645579639346626069, 0.639124444863775743801488193, // 568
+ 0.091908956497132728624990979, 0.995767414467659793982495643, // 572
+ 0.998475580573294752208559038, 0.055195244349689939809447526, // 576
+ 0.666999922303637506650154222, 0.745057785441465962407907310, // 580
+ 0.901348847046022014570746093, 0.433093818853151968484222638, // 584
+ 0.331106305759876401737190737, 0.943593458161960361495301445, // 588
+ 0.968522094274417316221088329, 0.248927605745720168110682816, // 592
+ 0.508830142543107036931749324, 0.860866938637767279344583877, // 596
+ 0.799537269107905033500246232, 0.600616479383868926653875896, // 600
+ 0.140658239332849230714788846, 0.990058210262297105505906464, // 604
+ 0.988257567730749491404792538, 0.152797185258443427720336613, // 608
+ 0.590759701858874228423887908, 0.806847553543799272206514313, // 612
+ 0.854557988365400520767862276, 0.519355990165589587361829932, // 616
+ 0.237023605994367206867735915, 0.971503890986251775537099622, // 620
+ 0.939459223602189911962669246, 0.342660717311994397592781983, // 624
+ 0.422000270799799685941287941, 0.906595704514915365332960588, // 628
+ 0.736816568877369875090132520, 0.676092703575315960360419228, // 632
+ 0.042938256934940823077124540, 0.999077727752645382888781997, // 636
+ 0.999529417501093163079703322, 0.030674803176636625934021028, // 640
+ 0.685083667772700381362052545, 0.728464390448225196492035438, // 644
+ 0.911706032005429851404397325, 0.410843171057903942183466675, // 648
+ 0.354163525420490382357395796, 0.935183509938947577642207480, // 652
+ 0.974339382785575860518721668, 0.225083911359792835991642120, // 656
+ 0.529803624686294668216054671, 0.848120344803297251279133563, // 660
+ 0.814036329705948361654516690, 0.580813958095764545075595272, // 664
+ 0.164913120489969921418189113, 0.986308097244598647863297524, // 668
+ 0.991709753669099522860049931, 0.128498110793793172624415589, // 672
+ 0.610382806276309452716352152, 0.792106577300212351782342879, // 676
+ 0.867046245515692651480195629, 0.498227666972781852410983869, // 680
+ 0.260794117915275518280186509, 0.965394441697689374550843858, // 684
+ 0.947585591017741134653387321, 0.319502030816015677901518272, // 688
+ 0.444122144570429231642069418, 0.895966249756185155914560282, // 692
+ 0.753186799043612482483430486, 0.657806693297078656931182264, // 696
+ 0.067443919563664057897972422, 0.997723066644191609848546728, // 700
+ 0.996820299291165714972629398, 0.079682437971430121147120656, // 704
+ 0.648514401022112445084560551, 0.761202385484261814029709836, // 708
+ 0.890448723244757889952150560, 0.455083587126343823535869268, // 712
+ 0.307849640041534893682063646, 0.951435020969008369549175569, // 716
+ 0.962121404269041595429604316, 0.272621355449948984493347477, // 720
+ 0.487550160148435954641485027, 0.873094978418290098636085973, // 724
+ 0.784556597155575233023892575, 0.620057211763289178646268191, // 728
+ 0.116318630911904767252544319, 0.993211949234794533104601012, // 732
+ 0.984210092386929073193874387, 0.177004220412148756196839844, // 736
+ 0.570780745886967280232652864, 0.821102514991104679060430820, // 740
+ 0.841554977436898409603499520, 0.540171472729892881297845480, // 744
+ 0.213110319916091373967757518, 0.977028142657754351485866211, // 748
+ 0.930766961078983731944872340, 0.365612997804773870011745909, // 752
+ 0.399624199845646828544117031, 0.916679059921042663116457013, // 756
+ 0.720002507961381629076682999, 0.693971460889654009003734389, // 760
+ 0.018406729905804820927366313, 0.999830581795823422015722275, // 764
+};
+
+const fpr fpr_tab_log10[] = {
+ 0.999995293809576171511580126, 0.003067956762965976270145365, // 1024
+ 0.704934080375904908852523758, 0.709272826438865651316533772, // 1028
+ 0.922701128333878570437264227, 0.385516053843918864075607949, // 1032
+ 0.379847208924051170576281147, 0.925049240782677590302371869, // 1036
+ 0.980182135968117392690210009, 0.198098410717953586179324918, // 1040
+ 0.553016705580027531764226988, 0.833170164701913186439915922, // 1044
+ 0.829761233794523042469023765, 0.558118531220556115693702964, // 1048
+ 0.192080397049892441679288205, 0.981379193313754574318224190, // 1052
+ 0.994879330794805620591166107, 0.101069862754827824987887585, // 1056
+ 0.632018735939809021909403706, 0.774953106594873878359129282, // 1060
+ 0.880470889052160770806542929, 0.474100214650550014398580015, // 1064
+ 0.287347459544729526477331841, 0.957826413027532890321037029, // 1068
+ 0.956045251349996443270479823, 0.293219162694258650606608599, // 1072
+ 0.468688822035827933697617870, 0.883363338665731594736308015, // 1076
+ 0.771060524261813773200605759, 0.636761861236284230413943435, // 1080
+ 0.094963495329638998938034312, 0.995480755491926941769171600, // 1084
+ 0.998640218180265222418199049, 0.052131704680283321236358216, // 1088
+ 0.669282588346636065720696366, 0.743007952135121693517362293, // 1092
+ 0.902673318237258806751502391, 0.430326481340082633908199031, // 1096
+ 0.333999651442009404650865481, 0.942573197601446879280758735, // 1100
+ 0.969281235356548486048290738, 0.245955050335794611599924709, // 1104
+ 0.511468850437970399504391001, 0.859301818357008404783582139, // 1108
+ 0.801376171723140219430247777, 0.598160706996342311724958652, // 1112
+ 0.143695033150294454819773349, 0.989622017463200834623694454, // 1116
+ 0.988721691960323767604516485, 0.149764534677321517229695737, // 1120
+ 0.593232295039799808047809426, 0.805031331142963597922659282, // 1124
+ 0.856147328375194481019630732, 0.516731799017649881508753876, // 1128
+ 0.240003022448741486568922365, 0.970772140728950302138169611, // 1132
+ 0.940506070593268323787291309, 0.339776884406826857828825803, // 1136
+ 0.424779681209108833357226189, 0.905296759318118774354048329, // 1140
+ 0.738887324460615147933116508, 0.673829000378756060917568372, // 1144
+ 0.046003182130914628814301788, 0.998941293186856850633930266, // 1148
+ 0.999618822495178597116830637, 0.027608145778965741612354872, // 1152
+ 0.687315340891759108199186948, 0.726359155084345976817494315, // 1156
+ 0.912962190428398164628018233, 0.408044162864978680820747499, // 1160
+ 0.357030961233430032614954036, 0.934092550404258914729877883, // 1164
+ 0.975025345066994146844913468, 0.222093620973203534094094721, // 1168
+ 0.532403127877197971442805218, 0.846490938774052078300544488, // 1172
+ 0.815814410806733789010772660, 0.578313796411655563342245019, // 1176
+ 0.167938294974731178054745536, 0.985797509167567424700995000, // 1180
+ 0.992099313142191757112085445, 0.125454983411546238542336453, // 1184
+ 0.612810082429409703935211936, 0.790230221437310055030217152, // 1188
+ 0.868570705971340895340449876, 0.495565261825772531150266670, // 1192
+ 0.263754678974831383611349322, 0.964589793289812723836432159, // 1196
+ 0.948561349915730288158494826, 0.316593375556165867243047035, // 1200
+ 0.446868840162374195353044389, 0.894599485631382678433072126, // 1204
+ 0.755201376896536527598710756, 0.655492852999615385312679701, // 1208
+ 0.070504573389613863027351471, 0.997511456140303459699448390, // 1212
+ 0.997060070339482978987989949, 0.076623861392031492278332463, // 1216
+ 0.650846684996380915068975573, 0.759209188978388033485525443, // 1220
+ 0.891840709392342727796478697, 0.452349587233770874133026703, // 1224
+ 0.310767152749611495835997250, 0.950486073949481721759926101, // 1228
+ 0.962953266873683886347921481, 0.269668325572915106525464462, // 1232
+ 0.490226483288291154229598449, 0.871595086655951034842481435, // 1236
+ 0.786455213599085757522319464, 0.617647307937803932403979402, // 1240
+ 0.119365214810991364593637790, 0.992850414459865090793563344, // 1244
+ 0.984748501801904218556553176, 0.173983873387463827950700807, // 1248
+ 0.573297166698042212820171239, 0.819347520076796960824689637, // 1252
+ 0.843208239641845437161743865, 0.537587076295645482502214932, // 1256
+ 0.216106797076219509948385131, 0.976369731330021149312732194, // 1260
+ 0.931884265581668106718557199, 0.362755724367397216204854462, // 1264
+ 0.402434650859418441082533934, 0.915448716088267819566431292, // 1268
+ 0.722128193929215321243607198, 0.691759258364157774906734132, // 1272
+ 0.021474080275469507418374898, 0.999769405351215321657617036, // 1276
+ 0.999882347454212525633049627, 0.015339206284988101044151868, // 1280
+ 0.696177131491462944788582591, 0.717870045055731736211325329, // 1284
+ 0.917900775621390457642276297, 0.396809987416710328595290911, // 1288
+ 0.368466829953372331712746222, 0.929640895843181265457918066, // 1292
+ 0.977677357824509979943404762, 0.210111836880469621717489972, // 1296
+ 0.542750784864515906586768661, 0.839893794195999504583383987, // 1300
+ 0.822849781375826332046780034, 0.568258952670131549790548489, // 1304
+ 0.180022901405699522679906590, 0.983662419211730274396237776, // 1308
+ 0.993564135520595333782021697, 0.113270952177564349018228733, // 1312
+ 0.622461279374149972519166721, 0.782650596166575738458949301, // 1316
+ 0.874586652278176112634431897, 0.484869248000791101822951699, // 1320
+ 0.275571819310958163076425168, 0.961280485811320641748659653, // 1324
+ 0.952375012719765858529893608, 0.304929229735402406490728633, // 1328
+ 0.457813303598877221904961155, 0.889048355854664562540777729, // 1332
+ 0.763188417263381271704838297, 0.646176012983316364832802220, // 1336
+ 0.082740264549375693111987083, 0.996571145790554847093566910, // 1340
+ 0.997925286198596012623025462, 0.064382630929857460819324537, // 1344
+ 0.660114342067420478559490747, 0.751165131909686411205819422, // 1348
+ 0.897324580705418281231391836, 0.441371268731716692879988968, // 1352
+ 0.322407678801069848384807478, 0.946600913083283570044599823, // 1356
+ 0.966190003445412555433832961, 0.257831102162159005614471295, // 1360
+ 0.500885382611240786241285004, 0.865513624090569082825488358, // 1364
+ 0.793975477554337164895083757, 0.607949784967773667243642671, // 1368
+ 0.131540028702883111103387493, 0.991310859846115418957349799, // 1372
+ 0.986809401814185476970235952, 0.161886393780111837641387995, // 1376
+ 0.583308652937698294392830961, 0.812250586585203913049744181, // 1380
+ 0.849741768000852489471268395, 0.527199134781901348464274575, // 1384
+ 0.228072083170885739254457379, 0.973644249650811925318383912, // 1388
+ 0.936265667170278246576310996, 0.351292756085567125601307623, // 1392
+ 0.413638312238434547471944324, 0.910441292258067196934095369, // 1396
+ 0.730562769227827561177758850, 0.682845546385248068164596123, // 1400
+ 0.033741171851377584833716112, 0.999430604555461772019008327, // 1404
+ 0.999204758618363895492950001, 0.039872927587739811128578738, // 1408
+ 0.678350043129861486873655042, 0.734738878095963464563223604, // 1412
+ 0.907886116487666212038681480, 0.419216888363223956433010020, // 1416
+ 0.345541324963989065539191723, 0.938403534063108112192420774, // 1420
+ 0.972226497078936305708321144, 0.234041958583543423191242045, // 1424
+ 0.521975292937154342694258318, 0.852960604930363657746588082, // 1428
+ 0.808656181588174991946968128, 0.588281548222645304786439813, // 1432
+ 0.155828397654265235743101486, 0.987784141644572154230969032, // 1436
+ 0.990485084256457037998682243, 0.137620121586486044948441663, // 1440
+ 0.603066598540348201693430617, 0.797690840943391108362662755, // 1444
+ 0.862423956111040538690933878, 0.506186645345155291048942344, // 1448
+ 0.251897818154216950498106628, 0.967753837093475465243391912, // 1452
+ 0.944604837261480265659265493, 0.328209843579092526107916817, // 1456
+ 0.435857079922255491032544080, 0.900015892016160228714535267, // 1460
+ 0.747100605980180144323078847, 0.664710978203344868130324985, // 1464
+ 0.058258264500435759613979782, 0.998301544933892840738782163, // 1468
+ 0.996044700901251989887944810, 0.088853552582524596561586535, // 1472
+ 0.641481012808583151988739898, 0.767138911935820381181694573, // 1476
+ 0.886222530148880631647990821, 0.463259783551860197390719637, // 1480
+ 0.299079826308040476750336973, 0.954228095109105629780430732, // 1484
+ 0.959571513081984528335528181, 0.281464937925757984095231007, // 1488
+ 0.479493757660153026679839798, 0.877545290207261291668470750, // 1492
+ 0.778816512381475953374724325, 0.627251815495144113509622565, // 1496
+ 0.107172424956808849175529148, 0.994240449453187946358413442, // 1500
+ 0.982539302287441255907040396, 0.186055151663446648105438304, // 1504
+ 0.563199344013834115007363772, 0.826321062845663480311195452, // 1508
+ 0.836547727223511984524285790, 0.547894059173100165608820571, // 1512
+ 0.204108966092816874181696950, 0.978948175319062194715480124, // 1516
+ 0.927362525650401087274536959, 0.374164062971457997104393020, // 1520
+ 0.391170384302253888687512949, 0.920318276709110566440076541, // 1524
+ 0.713584868780793592903125099, 0.700568793943248366792866380, // 1528
+ 0.009203754782059819315102378, 0.999957644551963866333120920, // 1532
+};
+
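+/*
+ * fpr_table[logn] points to the twiddle table used when processing FFT level
+ * logn; levels 0 and 1 need no table and are left NULL.
+ */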
+const fpr *fpr_table[] = {
+ NULL, NULL,
+ fpr_tab_log2,
+ fpr_tab_log3,
+ fpr_tab_log4,
+ fpr_tab_log5,
+ fpr_tab_log6,
+ fpr_tab_log7,
+ fpr_tab_log8,
+ fpr_tab_log9,
+ fpr_tab_log10,
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.h
new file mode 100644
index 000000000..ae99a0bd6
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/fpr.h
@@ -0,0 +1,247 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "macrof.h"
+/*
+ * In this AArch64 implementation, 'fpr' is a plain alias for the native
+ * 'double' type; arithmetic on it should nevertheless go through the inline
+ * functions below, which add no runtime cost since they are all 'inline'.
+ */
+typedef double fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return (double)i;
+}
+
+static const fpr fpr_q = 12289.0 ;
+static const fpr fpr_inverse_of_q = 1.0 / 12289.0 ;
+static const fpr fpr_inv_2sqrsigma0 = .150865048875372721532312163019 ;
+static const fpr fpr_inv_sigma_10 = 0.0059386453095331159950250124336477482 ;
+static const fpr fpr_sigma_min_10 = 1.2982803343442918539708792538826807 ;
+static const fpr fpr_log2 = 0.69314718055994530941723212146 ;
+static const fpr fpr_inv_log2 = 1.4426950408889634073599246810 ;
+static const fpr fpr_bnorm_max = 16822.4121 ;
+static const fpr fpr_zero = 0.0 ;
+static const fpr fpr_one = 1.0 ;
+static const fpr fpr_two = 2.0 ;
+static const fpr fpr_onehalf = 0.5 ;
+static const fpr fpr_invsqrt2 = 0.707106781186547524400844362105 ;
+static const fpr fpr_invsqrt8 = 0.353553390593273762200422181052 ;
+static const fpr fpr_ptwo31 = 2147483648.0 ;
+static const fpr fpr_ptwo31m1 = 2147483647.0 ;
+static const fpr fpr_mtwo31m1 = -2147483647.0 ;
+static const fpr fpr_ptwo63m1 = 9223372036854775807.0 ;
+static const fpr fpr_mtwo63m1 = -9223372036854775807.0 ;
+static const fpr fpr_ptwo63 = 9223372036854775808.0 ;
+
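+/*
+ * Round to the nearest integer (ties to even) with the AArch64 fcvtns
+ * instruction, avoiding any data-dependent branch.
+ */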
+static inline int64_t
+fpr_rint(fpr x) {
+ int64_t t;
+ __asm__ ( "fcvtns %x0, %d1": "=r" (t) : "w" (x));
+ return t;
+}
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x;
+ return r - (x < (double)r);
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x;
+}
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return (x + y);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return (x - y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return (-x);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return (x * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return (x + x);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return (x * y);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return (x * x);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return (1.0 / x);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return (x / y);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ __asm__ ( "fsqrt %d0, %d0" : "+w" (x) : : );
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x < y;
+}
+
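+/*
+ * Compute ccs*exp(-x)*2^63, returned as an unsigned integer. exp(-x) is
+ * approximated by a degree-12 polynomial; the even/odd coefficient halves
+ * are evaluated in the two NEON lanes with fused multiply-adds and
+ * recombined with a horizontal add.
+ */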
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ static const double C_expm[] = {
+ 1.000000000000000000000000000000, // c0
+ -0.999999999999994892974086724280, // c1
+ 0.500000000000019206858326015208, // c2
+ -0.166666666666984014666397229121, // c3
+ 0.041666666666110491190622155955, // c4
+ -0.008333333327800835146903501993, // c5
+ 0.001388888894063186997887560103, // c6
+ -0.000198412739277311890541063977, // c7
+ 0.000024801566833585381209939524, // c8
+ -0.000002755586350219122514855659, // c9
+ 0.000000275607356160477811864927, // c10
+ -0.000000025299506379442070029551, // c11
+ 0.000000002073772366009083061987, // c12
+ 0.000000000000000000000000000000,
+ };
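+    /*
+     * Evaluation layout (informal sketch): assuming the vfmla() macro
+     * from macrof.h computes d = a + b*c lane-wise, the steps below
+     * build pairs of partial sums, e.g. y1 = {c0 + c2*x^2, c1 + c3*x^2};
+     * multiplying by neon_1x = {1, x} turns that into
+     * {c0 + c2*x^2, c1*x + c3*x^3}. Combining y1, y2*x^4, y3*x^8 and the
+     * c12*x^12 term, the final horizontal add yields
+     * sum(c_i * x^i, i = 0..12) scaled by ccs * 2^63, i.e. an
+     * Estrin-style evaluation of the polynomial approximating exp(-x).
+     */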
+ float64x2_t neon_x, neon_1x, neon_x2,
+ neon_x4, neon_x8, neon_x12, neon_ccs;
+ float64x2x4_t neon_exp0;
+ float64x2x3_t neon_exp1;
+ float64x2_t y1, y2, y3, y;
+ double ret;
+
+ neon_exp0 = vld1q_f64_x4(&C_expm[0]);
+ neon_exp1 = vld1q_f64_x3(&C_expm[8]);
+ neon_ccs = vdupq_n_f64(ccs);
+ neon_ccs = vmulq_n_f64(neon_ccs, fpr_ptwo63);
+
+ // x | x
+ neon_x = vdupq_n_f64(x);
+ // 1 | x
+ neon_1x = vsetq_lane_f64(1.0, neon_x, 0);
+ neon_x2 = vmulq_f64(neon_x, neon_x);
+ neon_x4 = vmulq_f64(neon_x2, neon_x2);
+ neon_x8 = vmulq_f64(neon_x4, neon_x4);
+ neon_x12 = vmulq_f64(neon_x8, neon_x4);
+
+ vfmla(y1, neon_exp0.val[0], neon_exp0.val[1], neon_x2);
+ vfmla(y2, neon_exp0.val[2], neon_exp0.val[3], neon_x2);
+ vfmla(y3, neon_exp1.val[0], neon_exp1.val[1], neon_x2);
+
+ y1 = vmulq_f64(y1, neon_1x);
+ y2 = vmulq_f64(y2, neon_1x);
+ y3 = vmulq_f64(y3, neon_1x);
+
+ vfmla(y, y1, y2, neon_x4);
+ vfmla(y, y, y3, neon_x8);
+ vfmla(y, y, neon_exp1.val[2], neon_x12);
+ y = vmulq_f64( y, neon_ccs);
+ ret = vaddvq_f64(y);
+
+ return (uint64_t) ret;
+}
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED1024_AARCH64_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+#define fpr_tab_log2 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log2
+#define fpr_tab_log3 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log3
+#define fpr_tab_log4 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log4
+#define fpr_tab_log5 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log5
+#define fpr_tab_log6 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log6
+#define fpr_tab_log7 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log7
+#define fpr_tab_log8 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log8
+#define fpr_tab_log9 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log9
+#define fpr_tab_log10 PQCLEAN_FALCONPADDED1024_AARCH64_fpr_tab_log10
+#define fpr_table PQCLEAN_FALCONPADDED1024_AARCH64_fpr_table
+
+extern const fpr fpr_tab_log2[];
+extern const fpr fpr_tab_log3[];
+extern const fpr fpr_tab_log4[];
+extern const fpr fpr_tab_log5[];
+extern const fpr fpr_tab_log6[];
+extern const fpr fpr_tab_log7[];
+extern const fpr fpr_tab_log8[];
+extern const fpr fpr_tab_log9[];
+extern const fpr fpr_tab_log10[];
+extern const fpr *fpr_table[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/inner.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/inner.h
new file mode 100644
index 000000000..9674aecfc
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/inner.h
@@ -0,0 +1,825 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+#include "params.h"
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED1024_AARCH64_ macro (e.g. PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ *   with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(void *out, size_t max_out_len, const int8_t *x, uint8_t bits);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x);
+
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_modq_decode(uint16_t *x, const void *in,
+ size_t max_in_len, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(int8_t *x, unsigned bits, const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len);
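+
+/*
+ * Illustrative sketch (not part of this API): round-tripping a
+ * degree-1024 signature vector through the 'comp' codec. The buffer
+ * size used here is a hypothetical bound chosen only for the example.
+ *
+ *   int16_t s2[1024], dec[1024];
+ *   uint8_t buf[2048];
+ *   size_t len, rd;
+ *
+ *   len = PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(buf, sizeof buf, s2);
+ *   // len == 0 means the buffer was too small or a value was out of range
+ *   rd = PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(dec, buf, len);
+ *   // rd == 0 means the encoding was invalid
+ */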
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AARCH64_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
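+
+/*
+ * Illustrative sketch (not part of this API): the context is typically
+ * prepared by injecting the nonce and then the message, flipping it,
+ * and deriving the hashed point. nonce, msg and their lengths are
+ * caller-provided; tmp alignment handling is omitted here.
+ *
+ *   inner_shake256_context sc;
+ *   uint16_t hm[1024];
+ *   uint8_t tmp[2048];
+ *
+ *   inner_shake256_init(&sc);
+ *   inner_shake256_inject(&sc, nonce, nonce_len);
+ *   inner_shake256_inject(&sc, msg, msg_len);
+ *   inner_shake256_flip(&sc);
+ *   PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(&sc, hm, 10, tmp);
+ *   inner_shake256_ctx_release(&sc);
+ */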
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short(const int16_t *s1, const int16_t *s2);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const double *t0,
+ const double *t1);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+/*
+ * Convert a public key to NTT. Conversion is done in place.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt(int16_t *h);
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt_monty(int16_t *h);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_compute_public(int16_t *h, const int8_t *f,
+ const int8_t *g, int16_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ *        return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ *   fpr fpr_sigma_min[]     sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+ }
+ return v;
+}
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(fpr *c, const fpr *a, const fpr *restrict b, const fpr *restrict d, unsigned logn);
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(fpr *d, fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d,
+ const fpr *a, const fpr *restrict b, unsigned logn);
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(fpr *c, const fpr *restrict d, const fpr *restrict a, unsigned logn);
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_fft(fpr *restrict c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add_muladj_fft(fpr *restrict d,
+ const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01, fpr *restrict g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(fpr *restrict d11, fpr *restrict l10,
+ const fpr *restrict g00, const fpr *restrict g01,
+ const fpr *restrict g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1,
+ const fpr *restrict f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(fpr *restrict f,
+ const fpr *restrict f0, const fpr *restrict f1, unsigned logn);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm, const unsigned falcon_n);
+
+fpr PQCLEAN_FALCONPADDED1024_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2);
+
+int32_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_small_sqnorm(const int8_t *f); // common.c
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
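+
+/*
+ * Illustrative sketch (not part of this API): generating a degree-1024
+ * key pair. The union provides the 64-bit alignment required for tmp[];
+ * the seed and its length are caller-provided.
+ *
+ *   union { uint8_t b[FALCON_KEYGEN_TEMP_10]; uint64_t a64; fpr afpr; } tmp;
+ *   int8_t f[1024], g[1024], F[1024], G[1024];
+ *   uint16_t h[1024];
+ *   inner_shake256_context rng;
+ *
+ *   inner_shake256_init(&rng);
+ *   inner_shake256_inject(&rng, seed, seed_len);
+ *   inner_shake256_flip(&rng);
+ *   PQCLEAN_FALCONPADDED1024_AARCH64_keygen(&rng, f, g, F, G, h, 10, tmp.b);
+ *   inner_shake256_ctx_release(&rng);
+ */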
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED1024_AARCH64_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signing cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp);
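+
+/*
+ * Illustrative sketch (not part of this API): dynamic signing of an
+ * already hashed message hm[] with a raw key (f, g, F, G), where rng is
+ * a seeded and flipped SHAKE256 context. The union provides the 64-bit
+ * alignment required for tmp[]; 72*1024 bytes corresponds to 72*2^logn
+ * with logn = 10.
+ *
+ *   union { uint8_t b[72 * 1024]; uint64_t a64; fpr afpr; } tmp;
+ *   int16_t sig[1024];
+ *
+ *   PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(sig, &rng, f, g, F, G, hm, tmp.b);
+ */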
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_gaussian0_sampler(prng *p);
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/keygen.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/keygen.c
new file mode 100644
index 000000000..d023e58c0
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/keygen.c
@@ -0,0 +1,4200 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "util.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
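+
+/*
+ * Worked example of the conversions above, writing montymul(a, b) for
+ * the Montgomery product (a*b)/R mod p: to obtain a plain product
+ * x*y mod p from operands in normal representation, first convert x to
+ * Montgomery representation by multiplying by R^2
+ * (montymul(x, R^2) = x*R), then use a mixed-representation product
+ * (montymul(x*R, y) = x*y mod p). Converting a value z*R back to normal
+ * representation is montymul(z*R, 1) = z.
+ */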
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
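+ *
+ * The computation below is a Newton iteration for inverses modulo a
+ * power of two: if p*y == 1 mod 2^k, then p*(y*(2 - p*y)) == 1 mod 2^(2k).
+ * The starting value y = 2 - p satisfies p*y == 1 mod 4 (since p is odd),
+ * and each of the four update steps doubles the precision (4, 8, 16, then
+ * 32 bits), so the final y is 1/p mod 2^32; returning (-y) mod 2^31 then
+ * yields -1/p mod 2^31.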
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
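+ *
+ * The returned value is a*b/(2^31) mod p (i.e. a*b*R^-1 mod p with
+ * R = 2^31): the correction term w is a multiple of p chosen so that
+ * z + w is divisible by 2^31, hence the shift by 31 bits is exact.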
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
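+ *
+ * The inverse of b is obtained through Fermat's little theorem: since p
+ * is prime, 1/b = b^(p-2) mod p, computed below with a square-and-multiply
+ * over the exponent e = p - 2 (using a branchless bit selection).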
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
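+ *
+ * This is the usual iterative (Cooley-Tukey style) NTT: at each of the
+ * logn stages, pairs (x, y) are combined into (x + s*y, x - s*y), where
+ * the twiddle factor s is read from gm[] (powers of g stored in
+ * bit-reversal order and in Montgomery representation).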
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
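+ *
+ * Why adjacent products work: in the NTT layout used here, entries 2*u
+ * and 2*u+1 hold the evaluations of f at a pair of opposite roots w and
+ * -w, and f(w)*f(-w) = f0(w^2)^2 - w^2*f1(w^2)^2 = f'(w^2); the products
+ * therefore form the (half-size) NTT of f'.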
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
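+ *
+ * For example, a nonnegative value v stored over 'len' words encodes
+ *    v = x[0] + x[1]*2^31 + ... + x[len-1]*2^(31*(len-1))
+ * with 0 <= x[i] < 2^31 for each word.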
+ */
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
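+ *
+ * Multiplying by 2^31 is done as a Montgomery multiplication by
+ * R2 = 2^62 mod p, since montymul(x, R2) = x*2^62/2^31 = x*2^31 mod p.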
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *restrict x,
+ const uint32_t *restrict y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *restrict tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
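+ *
+ * This is one step of incremental CRT: writing the new value as
+ * (x mod q) + q*t, the condition "new value == xp mod p" requires
+ * t = (xp - xq)/q mod p; the factor s therefore plays the role of
+ * 1/q mod p.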
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have a 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
+ const uint32_t *restrict x, const uint32_t *restrict y,
+ size_t len, uint32_t *restrict tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remainder of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step reduces the total length
+ * (sum of lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, so dividing one of them by 2 amounts
+ * to simply not doubling it.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to an upper word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
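+ *
+ * Reduction modulo X^N+1 is handled by the wrap in the inner loop: when
+ * the product index u+v reaches N, accumulation restarts at index 0 with
+ * the sign of k flipped, since X^N = -1.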
+ */
+static void
+poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
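+ *
+ * The product k*f is computed in RNS: for each small prime, k and f are
+ * reduced, multiplied in NTT representation, and converted back; the CRT
+ * then rebuilds k*f as plain big integers, and a single pass subtracts
+ * the scaled result from F.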
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *restrict tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
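+ *
+ * For example, with logn = 9 (N = 512), two values are summed, and the
+ * result has standard deviation sigma*sqrt(2) = 1.17*sqrt(q/(2*512)),
+ * which is the target for N = 512.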
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
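+ *
+ * The accumulator ng below ORs together all intermediate sums; if any of
+ * them reaches 2^31, its top bit makes -(ng >> 31) equal to all-ones,
+ * which forces the returned value to 2^32-1.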
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regards to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regards to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
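+ *
+ * For each small prime, the output coefficients are the products of
+ * adjacent NTT coefficients of the input, i.e. the same field-norm
+ * construction as in modp_poly_rec_res(): f' = f0^2 - X*f1^2 with
+ * f = f0(X^2) + X*f1(X^2), and similarly for g'.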
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
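+ /*
+ * Note: each step halves the degree. The first step takes its
+ * input straight from modp_set() (not in NTT), hence in_ntt is
+ * (d != 0); every intermediate output is kept in NTT so that the
+ * next step can skip the forward transform, and only the final
+ * step honours the caller's out_ntt flag.
+ */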
+ for (d = 0; d < depth; d ++) {
+ make_fg_step(data, logn - d, d,
+ d != 0, (d + 1) < depth || out_ntt);
+ }
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure of the NTRU solving equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficients will remain low. If they go out of range, then
+ * something went wrong and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ rlen = (slen > 10) ? 10 : slen;
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt3, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt4, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ rlen = (FGlen > 10) ? 10 : FGlen;
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(rt2, rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in the -(2^31-1)..+(2^31-1) range.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
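+ /*
+ * Worked example (illustration only): for dc = 5 the loop starts
+ * with pt = 1/2 and pdc = 1, then walks the bits of dc from least
+ * to most significant:
+ * bit 0 (=1): pdc = 1/2, pt = 1/4
+ * bit 1 (=0): pdc = 1/2, pt = 1/16
+ * bit 2 (=1): pdc = 1/32, pt = 1/256
+ * so pdc ends up equal to 2^(-5), the desired factor 2^(-dc).
+ */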
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
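+ /*
+ * Explanatory note: each word of Ft/Gt is a 31-bit limb, so bit 30
+ * of the top limb is the sign bit. The expression
+ * sw = -(Ft[FGlen - 1] >> 30) >> 1
+ * yields 0x00000000 for a nonnegative value and 0x7FFFFFFF for a
+ * negative one, which is exactly the padding limb needed to
+ * sign-extend over 31-bit words.
+ */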
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the n first elements
+ * of the inverse NTT table (igm); the call to modp_mkgm()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depth except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt5, rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt3, rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(rt4, rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_autoadj_fft(rt3, rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
+ r = modp_montymul(12289, 1, p, p0i);
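+ /*
+ * Note: modp_montymul(x, y, p, p0i) returns x*y/R mod p, so both z
+ * (below) and r carry the same 1/R factor; comparing them therefore
+ * checks f*G - g*F = q mod p without any extra conversion.
+ */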
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
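+ /*
+ * (Sketch of why: modulo 2, X^N+1 = (X+1)^N, so the resultant
+ * Res(f, X^N+1) has the same parity as f(1)^N, i.e. as the sum of
+ * the coefficients of f; an odd sum thus gives an odd resultant.)
+ */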
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ int16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
+ norm = (normf + normg) | -((normf | normg) >> 31);
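+ /*
+ * If either squared norm has bit 31 set, -((normf | normg) >> 31)
+ * is 0xFFFFFFFF and 'norm' saturates to 0xFFFFFFFF, so the test
+ * below rejects the pair; otherwise norm is simply normf + normg.
+ */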
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+
+ poly_small_to_fp(rt1, f, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt1, rt1, logn);
+
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(rt2, rt2, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(rt1, rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(rt2, rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(rt2, logn);
+
+ bnorm = PQCLEAN_FALCONPADDED1024_AARCH64_compute_bnorm(rt1, rt2);
+
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (int16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = (int16_t *)h;
+ tmp2 = (int16_t *)tmp;
+ }
+
+ if (!PQCLEAN_FALCONPADDED1024_AARCH64_compute_public(h2, f, g, tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrof.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrof.h
new file mode 100644
index 000000000..c8f82991e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrof.h
@@ -0,0 +1,125 @@
+/*
+ * 64-bit Floating point NEON macro x1
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+// c <= addr x1
+#define vload(c, addr) c = vld1q_f64(addr);
+// c <= addr interleave 2
+#define vload2(c, addr) c = vld2q_f64(addr);
+// c <= addr interleave 4
+#define vload4(c, addr) c = vld4q_f64(addr);
+
+#define vstore(addr, c) vst1q_f64(addr, c);
+// addr <= c
+#define vstore2(addr, c) vst2q_f64(addr, c);
+// addr <= c
+#define vstore4(addr, c) vst4q_f64(addr, c);
+
+// c <= addr x2
+#define vloadx2(c, addr) c = vld1q_f64_x2(addr);
+// c <= addr x3
+#define vloadx3(c, addr) c = vld1q_f64_x3(addr);
+
+// addr <= c
+#define vstorex2(addr, c) vst1q_f64_x2(addr, c);
+
+// c = a - b
+#define vfsub(c, a, b) c = vsubq_f64(a, b);
+
+// c = a + b
+#define vfadd(c, a, b) c = vaddq_f64(a, b);
+
+// c = a * b
+#define vfmul(c, a, b) c = vmulq_f64(a, b);
+
+// c = a * n (n is constant)
+#define vfmuln(c, a, n) c = vmulq_n_f64(a, n);
+
+// Swap from a|b to b|a
+#define vswap(c, a) c = vextq_f64(a, a, 1);
+
+// c = a * b[i]
+#define vfmul_lane(c, a, b, i) c = vmulq_laneq_f64(a, b, i);
+
+// c = 1/a
+#define vfinv(c, a) c = vdivq_f64(vdupq_n_f64(1.0), a);
+
+// c = -a
+#define vfneg(c, a) c = vnegq_f64(a);
+
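+// 2x2 transpose of a.val[ia] and b.val[ib], with t.val[it] as scratch:
+// afterwards a.val[ia] = (a0, b0) and b.val[ib] = (a1, b1).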
+#define transpose_f64(a, b, t, ia, ib, it) \
+ t.val[it] = a.val[ia]; \
+ a.val[ia] = vzip1q_f64(a.val[ia], b.val[ib]); \
+ b.val[ib] = vzip2q_f64(t.val[it], b.val[ib]);
+
+/*
+ * c = a + jb
+ * c[0] = a[0] - b[1]
+ * c[1] = a[1] + b[0]
+ */
+#define vfcaddj(c, a, b) c = vcaddq_rot90_f64(a, b);
+
+/*
+ * c = a - jb
+ * c[0] = a[0] + b[1]
+ * c[1] = a[1] - b[0]
+ */
+#define vfcsubj(c, a, b) c = vcaddq_rot270_f64(a, b);
+
+// c[0] = c[0] + b[0]*a[0], c[1] = c[1] + b[1]*a[0]
+#define vfcmla(c, a, b) c = vcmlaq_f64(c, a, b);
+
+// c[0] = c[0] - b[1]*a[1], c[1] = c[1] + b[0]*a[1]
+#define vfcmla_90(c, a, b) c = vcmlaq_rot90_f64(c, a, b);
+
+// c[0] = c[0] - b[0]*a[0], c[1] = c[1] - b[1]*a[0]
+#define vfcmla_180(c, a, b) c = vcmlaq_rot180_f64(c, a, b);
+
+// c[0] = c[0] + b[1]*a[1], c[1] = c[1] - b[0]*a[1]
+#define vfcmla_270(c, a, b) c = vcmlaq_rot270_f64(c, a, b);
+
+/*
+ * Complex MUL: c = a*b
+ * c[0] = a[0]*b[0] - a[1]*b[1]
+ * c[1] = a[0]*b[1] + a[1]*b[0]
+ */
+#define FPC_CMUL(c, a, b) \
+ c = vmulq_laneq_f64(b, a, 0); \
+ c = vcmlaq_rot90_f64(c, a, b);
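+// (Expansion check: vmulq_laneq_f64(b, a, 0) gives (a[0]*b[0], a[0]*b[1]),
+// and the rot90 accumulate adds (-a[1]*b[1], +a[1]*b[0]), which matches the
+// complex product described above.)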
+
+/*
+ * Complex MUL: c = a * conjugate(b) = a * (b[0], -b[1])
+ * c[0] = b[0]*a[0] + b[1]*a[1]
+ * c[1] = + b[0]*a[1] - b[1]*a[0]
+ */
+#define FPC_CMUL_CONJ(c, a, b) \
+ c = vmulq_laneq_f64(a, b, 0); \
+ c = vcmlaq_rot270_f64(c, b, a);
+
+// d = c + a * b
+#define vfmla(d, c, a, b) d = vfmaq_f64(c, a, b);
+// d = c - a * b
+#define vfmls(d, c, a, b) d = vfmsq_f64(c, a, b);
+// d = c + a * b[i]
+#define vfmla_lane(d, c, a, b, i) d = vfmaq_laneq_f64(c, a, b, i);
+// d = c - a * b[i]
+#define vfmls_lane(d, c, a, b, i) d = vfmsq_laneq_f64(c, a, b, i);
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrofx4.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrofx4.h
new file mode 100644
index 000000000..e6b70e64e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrofx4.h
@@ -0,0 +1,430 @@
+/*
+ * 64-bit Floating point NEON macro x4
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+#include "macrof.h"
+
+#define vloadx4(c, addr) c = vld1q_f64_x4(addr);
+
+#define vstorex4(addr, c) vst1q_f64_x4(addr, c);
+
+#define vfdupx4(c, constant) \
+ c.val[0] = vdupq_n_f64(constant); \
+ c.val[1] = vdupq_n_f64(constant); \
+ c.val[2] = vdupq_n_f64(constant); \
+ c.val[3] = vdupq_n_f64(constant);
+
+#define vfnegx4(c, a) \
+ c.val[0] = vnegq_f64(a.val[0]); \
+ c.val[1] = vnegq_f64(a.val[1]); \
+ c.val[2] = vnegq_f64(a.val[2]); \
+ c.val[3] = vnegq_f64(a.val[3]);
+
+#define vfmulnx4(c, a, n) \
+ c.val[0] = vmulq_n_f64(a.val[0], n); \
+ c.val[1] = vmulq_n_f64(a.val[1], n); \
+ c.val[2] = vmulq_n_f64(a.val[2], n); \
+ c.val[3] = vmulq_n_f64(a.val[3], n);
+
+// c = a - b
+#define vfsubx4(c, a, b) \
+ c.val[0] = vsubq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_f64(a.val[3], b.val[3]);
+
+// c = a + b
+#define vfaddx4(c, a, b) \
+ c.val[0] = vaddq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vmulq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vmulq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vmulq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4_i(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b); \
+ c.val[1] = vmulq_f64(a.val[1], b); \
+ c.val[2] = vmulq_f64(a.val[2], b); \
+ c.val[3] = vmulq_f64(a.val[3], b);
+
+#define vfinvx4(c, a) \
+ c.val[0] = vdivq_f64(vdupq_n_f64(1.0), a.val[0]); \
+ c.val[1] = vdivq_f64(vdupq_n_f64(1.0), a.val[1]); \
+ c.val[2] = vdivq_f64(vdupq_n_f64(1.0), a.val[2]); \
+ c.val[3] = vdivq_f64(vdupq_n_f64(1.0), a.val[3]);
+
+#define vfcvtx4(c, a) \
+ c.val[0] = vcvtq_f64_s64(a.val[0]); \
+ c.val[1] = vcvtq_f64_s64(a.val[1]); \
+ c.val[2] = vcvtq_f64_s64(a.val[2]); \
+ c.val[3] = vcvtq_f64_s64(a.val[3]);
+
+#define vfmlax4(d, c, a, b) \
+ vfmla(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmla(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmla(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmla(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfmlsx4(d, c, a, b) \
+ vfmls(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmls(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmls(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmls(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfrintx4(c, a) \
+ c.val[0] = vcvtnq_s64_f64(a.val[0]); \
+ c.val[1] = vcvtnq_s64_f64(a.val[1]); \
+ c.val[2] = vcvtnq_s64_f64(a.val[2]); \
+ c.val[3] = vcvtnq_s64_f64(a.val[3]);
+
+/*
+ * Wrapper for FFT, split/merge and poly_float.c
+ */
+
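+// Complex multiply on packed re/im lanes: d = a * b, i.e.
+// d_re = a_re*b_re - a_im*b_im and d_im = a_re*b_im + a_im*b_re.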
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmul(d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MULx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MULx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmul(d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmul(d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmul(d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
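+// Complex multiply-accumulate: (d_re, d_im) += (a_re, a_im) * (b_re, b_im).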
+#define FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
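+// Conjugate multiply over four lanes: d = a * conj(b), i.e.
+// d_re = a_re*b_re + a_im*b_im and d_im = a_im*b_re - a_re*b_im.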
+#define FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmul(d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmul(d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmul(d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmul(d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmul(d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmul(d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmul(d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MUL_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_re, b_re_im, 1); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 0);
+
+#define FPC_MUL_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 1); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 0);
+
+#define FWD_TOP(t_re, t_im, b_re, b_im, zeta_re, zeta_im) \
+ FPC_MUL(t_re, t_im, b_re, b_im, zeta_re, zeta_im);
+
+#define FWD_TOP_LANE(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANE(t_re, t_im, b_re, b_im, zeta);
+
+#define FWD_TOP_LANEx4(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANEx4(t_re, t_im, b_re, b_im, zeta);
+
+/*
+ * FPC
+ */
+
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_re); \
+ d_im = vsubq_f64(a_im, b_im);
+
+#define FPC_SUBx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_im.val[3]);
+
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_re); \
+ d_im = vaddq_f64(a_im, b_im);
+
+#define FPC_ADDx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_im.val[3]);
+
+#define FWD_BOT(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUB(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+/*
+ * FPC_J
+ */
+
+#define FPC_ADDJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_im); \
+ d_im = vaddq_f64(a_im, b_re);
+
+#define FPC_ADDJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_re.val[3]);
+
+#define FPC_SUBJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_im); \
+ d_im = vsubq_f64(a_im, b_re);
+
+#define FPC_SUBJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_re.val[3]);
+
+#define FWD_BOTJ(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJ(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJ(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTJx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+//============== Inverse FFT
+/*
+ * FPC_J
+ * a * conj(b)
+ * Original (without swap):
+ * d_re = b_im * a_im + a_re * b_re;
+ * d_im = b_re * a_im - a_re * b_im;
+ */
+#define FPC_MUL_BOTJ_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmla_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_im, b_re_im, 0); \
+ vfmls_lane(d_im, d_im, a_re, b_re_im, 1);
+
+#define FPC_MUL_BOTJ_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmls_lane(d_im.val[0], d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmls_lane(d_im.val[1], d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmls_lane(d_im.val[2], d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[3], a_im.val[3], b_re_im, 0); \
+ vfmls_lane(d_im.val[3], d_im.val[3], a_re.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, b_im, a_im); \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmul(d_im, b_re, a_im); \
+ vfmls(d_im, d_im, a_re, b_im);
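+
+/*
+ * Reference-only scalar sketch of the complex arithmetic that the
+ * FPC_MUL_BOTJ* macros vectorize: d = a * conj(b).  The function name is
+ * illustrative and not part of this header; kept under #if 0 so it is
+ * never compiled.
+ */
+#if 0
+static inline void fpc_mul_conj_ref(double *d_re, double *d_im,
+                                    double a_re, double a_im,
+                                    double b_re, double b_im) {
+    *d_re = a_re * b_re + a_im * b_im; /* Re(a * conj(b)) */
+    *d_im = a_im * b_re - a_re * b_im; /* Im(a * conj(b)) */
+}
+#endif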
+
+#define INV_TOPJ(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta);
+
+/*
+ * FPC_Jm
+ * a * -conj(b)
+ * d_re = a_re * b_im - a_im * b_re;
+ * d_im = a_im * b_im + a_re * b_re;
+ */
+#define FPC_MUL_BOTJm_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 1); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 0); \
+ vfmul_lane(d_im, a_re, b_re_im, 0); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 1);
+
+#define FPC_MUL_BOTJm_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 1); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 1); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 1); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 1); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJm(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_im); \
+ vfmls(d_re, d_re, a_im, b_re); \
+ vfmul(d_im, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_re);
+
+#define INV_TOPJm(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJmx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta);
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrous.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrous.h
new file mode 100644
index 000000000..dfee8bc12
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/macrous.h
@@ -0,0 +1,469 @@
+/*
+ * Macros for signed/unsigned integer arithmetic
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+#define vmull_lo(c, a, b) c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
+
+#define vmull_hi(c, a, b) c = vmull_high_s16(a, b);
+
+#define vmulla_lo(d, c, a, b) d = vmlal_s16(c, vget_low_s16(a), vget_low_s16(b));
+
+#define vmulla_hi(d, c, a, b) d = vmlal_high_s16(c, a, b);
+
+#define vadd(c, a, b) c = vaddq_u32(a, b);
+
+#define vaddv(c, a) c = vaddvq_u32(a);
+
+#define vor(c, a, b) c = vorrq_u32(a, b);
+
+// Macros for NTT operations, using signed 16-bit arithmetic.
+#define vload_s16_4(c, addr) c = vld4q_s16(addr);
+#define vload_s16_x2(c, addr) c = vld1q_s16_x2(addr);
+#define vload_s16_x4(c, addr) c = vld1q_s16_x4(addr);
+
+#define vstore_s16_x4(addr, c) vst1q_s16_x4(addr, c);
+#define vstore_s16_x2(addr, c) vst1q_s16_x2(addr, c);
+#define vstore_s16_4(add, c) vst4q_s16(add, c);
+
+/*
+ * Strategy for NTT:
+ * - Forward and inverse NTT multiplications by constants use either Barrett or Montgomery *Rounding* arithmetic
+ * - Pointwise multiplication must use Montgomery *Doubling* arithmetic
+ *
+ * Rounding because:
+ *
+ * - Montgomery needs one coefficient to be *odd*, so it only works with precomputed coefficients
+ * => Tried this approach; very strict on the coefficient input range.
+ * => E.g. a*b: a in [-R/2, R/2], b in [-Q/2, Q/2], then c in [-2Q, 2Q]
+ *
+ * - Barrett multiplication seems to work better, with no such restriction
+ * => Proved to be good. E.g. c = a*b, a in [-R, R], b in [-Q/2, Q/2], then c in [-3Q/2, 3Q/2]
+ * However, the output bound varies with the input bound. Using this knowledge, we can further
+ * optimize the Barrett points by carefully checking the output bound implied by each input bound.
+ *
+ * - Barrett reduction with c = a % Q: a in [-R, R], then c in [-Q/2, Q/2]
+ *
+ *
+ * Doubling because:
+ * - Montgomery Doubling works with two unknown coefficients, no constraint at all
+ * => c = a*b. a, b in [-R, R], then c in [-Q, Q]
+ */
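+
+/*
+ * Reference-only scalar sketch of the Barrett *Rounding* multiplication that
+ * the butterfly macros below vectorize.  Q = 12289 is the Falcon modulus; the
+ * name ref_barrett_mul and the w_hi convention (w_hi ~ round(w * 2^15 / Q))
+ * are illustrative -- the real tables fold the doubling of vqrdmulh into the
+ * precomputed constants.  Kept under #if 0 so it is never compiled.
+ */
+#if 0
+static inline int16_t ref_barrett_mul(int16_t a, int16_t w, int16_t w_hi) {
+    int16_t hi = (int16_t)(((int32_t)a * w_hi + (1 << 14)) >> 15); /* ~ round(a*w / Q) */
+    /* a*w - hi*Q lies in [-3Q/2, 3Q/2], so low 16-bit arithmetic recovers it exactly */
+    return (int16_t)((int16_t)(a * w) - (int16_t)(hi * 12289));
+}
+#endif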
+
+// ------------ Forward NTT and Inverse NTT ------------
+/*
+ * GS Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define gsbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
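+
+/*
+ * For reference: what gsbf_br computes per lane, in scalar form, using the
+ * ref_barrett_mul sketch above (illustrative only, never compiled).
+ */
+#if 0
+static inline void ref_gs_butterfly(int16_t *a, int16_t *b, int16_t w, int16_t w_hi) {
+    int16_t t = (int16_t)(*a - *b);
+    *a = (int16_t)(*a + *b);
+    *b = ref_barrett_mul(t, w, w_hi); /* (a - b) * w mod Q */
+}
+#endif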
+
+#define gsbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]); \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top_x4(a, b, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
+#define gsbf_bri_bot_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top(a, b, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b);
+
+#define gsbf_bri_bot(b, zl, zh, i, QMVQ, t) \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_br_bot(b, zl, zh, QMVQ, t) \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+/*
+ * Barrett multiplication via *Rounding*, used for the inverse NTT
+ * Input: a, b, zl, zh, Q. a in [-R, R]
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define barmul_invntt(a, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(a, zh, i); \
+ a = vmulq_laneq_s16(a, zl, i); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
+
+#define barmul_invntt_x2(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0);
+
+#define barmul_invntt_x4(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], zh, i); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], zl, i); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], zl, i); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+/*
+ * Convert coefficients to Montgomery domain
+ */
+#define barmuli_mont(a, QMVM, t) \
+ t = vqrdmulhq_laneq_s16(a, QMVM, 6); \
+ a = vmulq_laneq_s16(a, QMVM, 2); \
+ a = vmlsq_laneq_s16(a, t, QMVM, 0);
+
+#define barmuli_mont_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 6); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 6); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 6); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 6); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 6); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 6); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 6); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 6); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], QMVM, 2); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], QMVM, 2); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], QMVM, 2); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], QMVM, 2); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 2); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 2); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 2); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 2); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * Convert coefficients to the Montgomery domain and embed n^-1
+ */
+
+#define barmuli_mont_ninv_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 7); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 7); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 7); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 7); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 7); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 7); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 7); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 7); \
+ a.val[0] = vshlq_n_s16(a.val[0], FALCON_LOG2_NINV_MONT); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vshlq_n_s16(a.val[1], FALCON_LOG2_NINV_MONT); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vshlq_n_s16(a.val[2], FALCON_LOG2_NINV_MONT); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vshlq_n_s16(a.val[3], FALCON_LOG2_NINV_MONT); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vshlq_n_s16(b.val[0], FALCON_LOG2_NINV_MONT); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vshlq_n_s16(b.val[1], FALCON_LOG2_NINV_MONT); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vshlq_n_s16(b.val[2], FALCON_LOG2_NINV_MONT); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vshlq_n_s16(b.val[3], FALCON_LOG2_NINV_MONT); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * CT Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define ctbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
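+
+/*
+ * For reference: what ctbf_br computes per lane, in scalar form, using the
+ * ref_barrett_mul sketch above (illustrative only, never compiled).
+ */
+#if 0
+static inline void ref_ct_butterfly(int16_t *a, int16_t *b, int16_t w, int16_t w_hi) {
+    int16_t t = ref_barrett_mul(*b, w, w_hi); /* b * w mod Q */
+    *b = (int16_t)(*a - t);
+    *a = (int16_t)(*a + t);
+}
+#endif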
+
+#define ctbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_br_top(b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bri_top(b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bot(a, b, t) \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_bri_top_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0);
+
+#define ctbf_bot_x4(a, b, t) \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+#define ctbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0); \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+// ------------ Pointwise Multiplication ------------
+/*
+ * Montgomery multiplication via *Doubling*
+ * Input: a, b, bNinv, Q
+ * Output: c = ab * R^-1
+ */
+#define montmul(c, a, b, QMVM, t) \
+ c = vqdmulhq_s16(a, b); \
+ t = vmulq_laneq_s16(b, QMVM, 1); \
+ t = vmulq_s16(a, t); \
+ t = vqdmulhq_laneq_s16(t, QMVM, 0); \
+ c = vhsubq_s16(c, t);
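+
+/*
+ * Reference-only scalar sketch of the Montgomery *Doubling* multiplication
+ * above: returns a*b*R^-1 mod Q with R = 2^16, Q = 12289.  REF_QINV is an
+ * assumed constant satisfying (Q * REF_QINV) mod 2^16 == 1; the vector code
+ * keeps the equivalent constants in the QMVM lanes.  Never compiled (#if 0).
+ */
+#if 0
+#define REF_QINV 53249
+static inline int16_t ref_montmul(int16_t a, int16_t b) {
+    int32_t p  = (int32_t)a * b;                        /* full 32-bit product        */
+    int16_t lo = (int16_t)((int16_t)p * REF_QINV);      /* p * Q^-1 mod 2^16          */
+    return (int16_t)((p - (int32_t)lo * 12289) >> 16);  /* a*b*R^-1 (mod Q), |.| < Q  */
+}
+#endif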
+
+#define montmul_x4(z, a, b, QMVM, t) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]);
+
+#define montmul_x8(z, w, a, b, e, f, QMVM, t, k) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ w.val[0] = vqdmulhq_s16(e.val[0], f.val[0]); \
+ w.val[1] = vqdmulhq_s16(e.val[1], f.val[1]); \
+ w.val[2] = vqdmulhq_s16(e.val[2], f.val[2]); \
+ w.val[3] = vqdmulhq_s16(e.val[3], f.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ k.val[0] = vmulq_laneq_s16(f.val[0], QMVM, 1); \
+ k.val[1] = vmulq_laneq_s16(f.val[1], QMVM, 1); \
+ k.val[2] = vmulq_laneq_s16(f.val[2], QMVM, 1); \
+ k.val[3] = vmulq_laneq_s16(f.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ k.val[0] = vmulq_s16(e.val[0], k.val[0]); \
+ k.val[1] = vmulq_s16(e.val[1], k.val[1]); \
+ k.val[2] = vmulq_s16(e.val[2], k.val[2]); \
+ k.val[3] = vmulq_s16(e.val[3], k.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]); \
+ k.val[0] = vqdmulhq_laneq_s16(k.val[0], QMVM, 0); \
+ w.val[0] = vhsubq_s16(w.val[0], k.val[0]); \
+ k.val[1] = vqdmulhq_laneq_s16(k.val[1], QMVM, 0); \
+ w.val[1] = vhsubq_s16(w.val[1], k.val[1]); \
+ k.val[2] = vqdmulhq_laneq_s16(k.val[2], QMVM, 0); \
+ w.val[2] = vhsubq_s16(w.val[2], k.val[2]); \
+ k.val[3] = vqdmulhq_laneq_s16(k.val[3], QMVM, 0); \
+ w.val[3] = vhsubq_s16(w.val[3], k.val[3]);
+
+// ------------ Barrett Reduction ------------
+/*
+ * Barrett reduction, return [-Q/2, Q/2]
+ * `v` = 5461, `n` = 11
+ */
+#define barrett(a, QMVQ, t) \
+ t = vqdmulhq_laneq_s16(a, QMVQ, 4); \
+ t = vrshrq_n_s16(t, 11); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
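+
+/*
+ * Reference-only scalar sketch of the Barrett reduction above
+ * (v = 5461 ~ round(2^26 / Q), n = 11, Q = 12289); rounding details differ
+ * slightly from the vector code.  Never compiled (#if 0).
+ */
+#if 0
+static inline int16_t ref_barrett_reduce(int16_t a) {
+    int16_t t = (int16_t)(((int32_t)a * 5461 + (1 << 25)) >> 26); /* ~ round(a / Q) */
+    return (int16_t)(a - t * 12289);                              /* in [-Q/2, Q/2] */
+}
+#endif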
+
+#define barrett_x2(a, i, j, m, n, QMVQ, t) \
+ t.val[m] = vqdmulhq_laneq_s16(a.val[i], QMVQ, 4); \
+ t.val[m] = vrshrq_n_s16(t.val[m], 11); \
+ t.val[n] = vqdmulhq_laneq_s16(a.val[j], QMVQ, 4); \
+ t.val[n] = vrshrq_n_s16(t.val[n], 11); \
+ a.val[i] = vmlsq_laneq_s16(a.val[i], t.val[m], QMVQ, 0); \
+ a.val[j] = vmlsq_laneq_s16(a.val[j], t.val[n], QMVQ, 0);
+
+#define barrett_x4(a, QMVQ, t) \
+ t.val[0] = vqdmulhq_laneq_s16(a.val[0], QMVQ, 4); \
+ t.val[0] = vrshrq_n_s16(t.val[0], 11); \
+ t.val[1] = vqdmulhq_laneq_s16(a.val[1], QMVQ, 4); \
+ t.val[1] = vrshrq_n_s16(t.val[1], 11); \
+ t.val[2] = vqdmulhq_laneq_s16(a.val[2], QMVQ, 4); \
+ t.val[2] = vrshrq_n_s16(t.val[2], 11); \
+ t.val[3] = vqdmulhq_laneq_s16(a.val[3], QMVQ, 4); \
+ t.val[3] = vrshrq_n_s16(t.val[3], 11); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+// ------------ Matrix Transpose ------------
+/*
+ * Matrix 4x4 transpose: v
+ * Input: int16x8x4_t v, tmp
+ * Output: int16x8x4_t v
+ */
+#define transpose(v, tmp) \
+ tmp.val[0] = vtrn1q_s16(v.val[0], v.val[1]); \
+ tmp.val[1] = vtrn2q_s16(v.val[0], v.val[1]); \
+ tmp.val[2] = vtrn1q_s16(v.val[2], v.val[3]); \
+ tmp.val[3] = vtrn2q_s16(v.val[2], v.val[3]); \
+ v.val[0] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[2] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[1] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]); \
+ v.val[3] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]);
+
+// ------------ Re-arrange vector ------------
+#define arrange(v_out, v_in, i, j, m, n, a, b, c, d) \
+ v_out.val[a] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[b] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[c] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]); \
+ v_out.val[d] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]);
+
+// ------------ Addition/Subtraction ------------
+#define vsub_x4(c, a, b) \
+ c.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+#define vadd_x4(c, a, b) \
+ c.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt.c
new file mode 100644
index 000000000..7007cf245
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt.c
@@ -0,0 +1,928 @@
+/*
+ * High-speed vectorized NTT for N = 512, 1024
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrous.h"
+#include "ntt_consts.h"
+#include "poly.h"
+
+/*
+ * Assume input is in the range [-Q/2, Q/2]
+ * Total Barrett points for N = 512, 1024: 2048, 4096
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont) {
+ // Total SIMD registers 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_ntt_br = PQCLEAN_FALCONPADDED1024_AARCH64_ntt_br;
+ const int16_t *ptr_ntt_qinv_br = PQCLEAN_FALCONPADDED1024_AARCH64_ntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+ zl.val[0] = vld1q_s16(ptr_ntt_br);
+ zh.val[0] = vld1q_s16(ptr_ntt_qinv_br);
+ ptr_ntt_br += 8;
+ ptr_ntt_qinv_br += 8;
+
+ // Layer 9, 8, 7
+ int16x8x2_t u0, u1, u2, u3, u4, u5, u6, u7;
+
+ for (unsigned j = 0; j < 128; j += 16) {
+ vload_s16_x2(u0, &a[j]);
+ vload_s16_x2(u1, &a[j + 128]);
+ vload_s16_x2(u2, &a[j + 256]);
+ vload_s16_x2(u3, &a[j + 384]);
+
+ vload_s16_x2(u4, &a[j + 512]);
+ vload_s16_x2(u5, &a[j + 640]);
+ vload_s16_x2(u6, &a[j + 768]);
+ vload_s16_x2(u7, &a[j + 896]);
+
+ // u0, 4: .5
+ // u1, 5: .5
+ // u2, 6: .5
+ // u3, 7: .5
+
+ // Layer 9
+ // u0 - u4, u1 - u5
+ // u2 - u6, u3 - u7
+ ctbf_bri_top(u4.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[0]);
+ ctbf_bri_top(u4.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[1]);
+ ctbf_bri_top(u5.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[2]);
+ ctbf_bri_top(u5.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(u6.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(u6.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(u7.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(u7.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(u0.val[0], u4.val[0], t.val[0]);
+ ctbf_bot(u0.val[1], u4.val[1], t.val[1]);
+ ctbf_bot(u1.val[0], u5.val[0], t.val[2]);
+ ctbf_bot(u1.val[1], u5.val[1], t.val[3]);
+
+ ctbf_bot(u2.val[0], u6.val[0], t2.val[0]);
+ ctbf_bot(u2.val[1], u6.val[1], t2.val[1]);
+ ctbf_bot(u3.val[0], u7.val[0], t2.val[2]);
+ ctbf_bot(u3.val[1], u7.val[1], t2.val[3]);
+
+ // u0, 4: 1.2
+ // u1, 5: 1.2
+ // u2, 6: 1.2
+ // u3, 7: 1.2
+
+ // Layer 8
+ // u0 - u2, u1 - u3
+ // u4 - u6, u5 - u7
+ ctbf_bri_top(u2.val[0], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[0]);
+ ctbf_bri_top(u2.val[1], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[1]);
+ ctbf_bri_top(u3.val[0], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[2]);
+ ctbf_bri_top(u3.val[1], zl.val[0], zh.val[0], 2, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(u6.val[0], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(u6.val[1], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(u7.val[0], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(u7.val[1], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(u0.val[0], u2.val[0], t.val[0]);
+ ctbf_bot(u0.val[1], u2.val[1], t.val[1]);
+ ctbf_bot(u1.val[0], u3.val[0], t.val[2]);
+ ctbf_bot(u1.val[1], u3.val[1], t.val[3]);
+
+ ctbf_bot(u4.val[0], u6.val[0], t2.val[0]);
+ ctbf_bot(u4.val[1], u6.val[1], t2.val[1]);
+ ctbf_bot(u5.val[0], u7.val[0], t2.val[2]);
+ ctbf_bot(u5.val[1], u7.val[1], t2.val[3]);
+
+ // 2.14 -> 0.5
+ barrett_x2(u0, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u1, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u2, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u3, 0, 1, 2, 3, neon_qmvq, t);
+
+ barrett_x2(u4, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u5, 0, 1, 2, 3, neon_qmvq, t2);
+ barrett_x2(u6, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u7, 0, 1, 2, 3, neon_qmvq, t2);
+ // u0, 4: .5
+ // u1, 5: .5
+ // u2, 6: .5
+ // u3, 7: .5
+
+ // Layer 7
+ // u0 - u1, u2 - u3
+ // u4 - u5, u6 - u7
+ ctbf_bri_top(u1.val[0], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[0]);
+ ctbf_bri_top(u1.val[1], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[1]);
+ ctbf_bri_top(u3.val[0], zl.val[0], zh.val[0], 5, neon_qmvq, t.val[2]);
+ ctbf_bri_top(u3.val[1], zl.val[0], zh.val[0], 5, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(u5.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(u5.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(u7.val[0], zl.val[0], zh.val[0], 7, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(u7.val[1], zl.val[0], zh.val[0], 7, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(u0.val[0], u1.val[0], t.val[0]);
+ ctbf_bot(u0.val[1], u1.val[1], t.val[1]);
+ ctbf_bot(u2.val[0], u3.val[0], t.val[2]);
+ ctbf_bot(u2.val[1], u3.val[1], t.val[3]);
+
+ ctbf_bot(u4.val[0], u5.val[0], t2.val[0]);
+ ctbf_bot(u4.val[1], u5.val[1], t2.val[1]);
+ ctbf_bot(u6.val[0], u7.val[0], t2.val[2]);
+ ctbf_bot(u6.val[1], u7.val[1], t2.val[3]);
+
+ // u0, 4: 1.2
+ // u1, 5: 1.2
+ // u2, 6: 1.2
+ // u3, 7: 1.2
+
+ // Store at 1.2Q
+ vstore_s16_x2(&a[j], u0);
+ vstore_s16_x2(&a[j + 128], u1);
+ vstore_s16_x2(&a[j + 256], u2);
+ vstore_s16_x2(&a[j + 384], u3);
+
+ vstore_s16_x2(&a[j + 512], u4);
+ vstore_s16_x2(&a[j + 640], u5);
+ vstore_s16_x2(&a[j + 768], u6);
+ vstore_s16_x2(&a[j + 896], u7);
+ }
+
+ // Layer 6, 5, 4, 3, 2, 1, 0
+ for (unsigned j = 0; j < FALCON_N; j += 128) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 32]);
+ vload_s16_x4(v2, &a[j + 64]);
+ vload_s16_x4(v3, &a[j + 96]);
+
+ vload_s16_x2(zlh, ptr_ntt_br);
+ vload_s16_x2(zhh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 16;
+ ptr_ntt_qinv_br += 16;
+
+ // Layer 6
+ // v0 - v2, v1 - v3
+ ctbf_bri_top_x4(v2, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v2, t);
+ ctbf_bot_x4(v1, v3, t2);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t);
+ barrett_x4(v3, neon_qmvq, t);
+
+ // Layer 5
+ // v0 - v1, v2 - v3
+ ctbf_bri_top_x4(v1, zlh.val[0], zhh.val[0], 1, 1, 1, 1, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v1, t);
+ ctbf_bot_x4(v2, v3, t2);
+
+ // 1.3
+
+ // Layer 4
+ // v0(0, 1 - 2, 3)
+ // v1(0, 1 - 2, 3)
+ // v2(0, 1 - 2, 3)
+ // v3(0, 1 - 2, 3)
+ ctbf_bri_top(v0.val[2], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[2], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[2], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[2], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t2.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t2.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t2.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 3
+ // v0(0, 2 - 1, 3)
+ // v1(0, 2 - 1, 3)
+ // v2(0, 2 - 1, 3)
+ // v3(0, 2 - 1, 3)
+ ctbf_bri_top(v0.val[1], zlh.val[0], zhh.val[0], 7, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[1], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[1], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[1], zhh.val[1], 4, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[1], zlh.val[1], zhh.val[1], 5, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[1], zhh.val[1], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v0.val[2], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[2]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[1], t2.val[0]);
+ ctbf_bot(v2.val[2], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[1], t2.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t2.val[3]);
+
+ // 1.3
+
+ // Layer 2
+ // Input:
+ // 0, 1, 2, 3 | 4, 5, 6, 7
+ // 8, 9, 10, 11 | 12, 13, 14, 15
+ // 16, 17, 18, 19 | 20, 21, 22, 23
+ // 24, 25, 26, 27 | 28, 29, 30, 31
+ arrange(t, v0, 0, 2, 1, 3, 0, 1, 2, 3);
+ v0 = t;
+ arrange(t, v1, 0, 2, 1, 3, 0, 1, 2, 3);
+ v1 = t;
+ arrange(t2, v2, 0, 2, 1, 3, 0, 1, 2, 3);
+ v2 = t2;
+ arrange(t2, v3, 0, 2, 1, 3, 0, 1, 2, 3);
+ v3 = t2;
+ // Output:
+ // 0, 1, 2, 3 | 16, 17, 18, 19
+ // 4, 5, 6, 7 | 20, 21, 22, 23
+ // 8, 9, 10, 11 | 24, 25, 26, 27
+ // 12, 13, 14, 15 | 28, 29, 30, 31
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 1: v0.val[0] x v0.val[2] | v0.val[1] x v0.val[3]
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // transpose 4x4
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[2], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[1]);
+ ctbf_br_top(v1.val[2], zl.val[1], zh.val[1], neon_qmvq, t.val[2]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_br_top(v2.val[2], zl.val[2], zh.val[2], neon_qmvq, t.val[0]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[1]);
+ ctbf_br_top(v3.val[2], zl.val[3], zh.val[3], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t.val[3]);
+
+ // 1.3
+
+ // Layer 0
+ // v(0, 2 - 1, 3)
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 2.3
+
+ if (mont == NTT_MONT) {
+ // Convert to the Montgomery domain by multiplying by FALCON_MONT
+ barmuli_mont_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_x8(v2, v3, neon_qmvq, t, t2);
+ } else if (mont == NTT_MONT_INV) {
+ barmuli_mont_ninv_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_ninv_x8(v2, v3, neon_qmvq, t, t2);
+ }
+
+ vstore_s16_4(&a[j], v0);
+ vstore_s16_4(&a[j + 32], v1);
+ vstore_s16_4(&a[j + 64], v2);
+ vstore_s16_4(&a[j + 96], v3);
+ }
+}
+
+/*
+ * Assume input is in the range [-Q, Q]
+ * Total Barrett points for N = 512, 1024: 1792, 3840
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv) {
+ // Total SIMD registers: 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_invntt_br = PQCLEAN_FALCONPADDED1024_AARCH64_invntt_br;
+ const int16_t *ptr_invntt_qinv_br = PQCLEAN_FALCONPADDED1024_AARCH64_invntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+ unsigned j;
+
+ // Layer 0, 1, 2, 3, 4, 5, 6
+ for (j = 0; j < FALCON_N; j += 128) {
+ vload_s16_4(v0, &a[j]);
+ vload_s16_4(v1, &a[j + 32]);
+ vload_s16_4(v2, &a[j + 64]);
+ vload_s16_4(v3, &a[j + 96]);
+
+ // Layer 0
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ // 0 - 1*, 2 - 3*
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 1
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ // 0 - 2*, 1 - 3*
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v0.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t.val[1]);
+ gsbf_br_bot(v1.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v1.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v2.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v3.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.6
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 2
+ // Before Transpose
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+
+ // After Transpose
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // 0 - 1*, 2 - 3*
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 1
+ // 1: 0.9
+ // 2: 1
+ // 3: 0.9
+
+ // Layer 3
+ // Re-arrange vector from
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // Compiler will handle register re-naming
+ arrange(t, v0, 0, 1, 2, 3, 0, 2, 1, 3);
+ v0 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t, v1, 0, 1, 2, 3, 0, 2, 1, 3);
+ v1 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v2, 0, 1, 2, 3, 0, 2, 1, 3);
+ v2 = t2;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v3, 0, 1, 2, 3, 0, 2, 1, 3);
+ v3 = t2;
+ // To
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 1, 2 - 3
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v0.val[2], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[2]);
+ gsbf_top(v1.val[2], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[1], t2.val[0]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[1], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[1], zlh.val[0], zhh.val[0], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[0], zhh.val[0], 1, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[1], zlh.val[0], zhh.val[0], 2, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[1], zlh.val[0], zhh.val[0], 4, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[1], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[0], zhh.val[0], 7, neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 4
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 2, 1 - 3
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[2], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[2], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[2], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[2], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.5
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 5
+ // Cross block
+ // v0.0->3 - v1.0->3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zlh.val[1], zhh.val[1], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 5, 5, 5, 5, neon_qmvq, t2);
+
+ // v0: 1
+ // v1: 0.9
+ // v2: 1
+ // v3: 0.9
+
+ // Layer 6
+ // Cross block
+ // v0.0->3 - v2.0->3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ gsbf_bri_bot_x4(v2, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t2);
+
+ // v0: 2
+ // v1: 1.8
+ // v2: 1.3
+ // v3: 1.2
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 32], v1);
+ vstore_s16_x4(&a[j + 64], v2);
+ vstore_s16_x4(&a[j + 96], v3);
+ }
+
+ ptr_invntt_br += 8 * ninv;
+ ptr_invntt_qinv_br += 8 * ninv;
+ zl.val[0] = vld1q_s16(ptr_invntt_br);
+ zh.val[0] = vld1q_s16(ptr_invntt_qinv_br);
+
+ // Layer 7, 8, 9
+ int16x8x2_t u0, u1, u2, u3, u4, u5, u6, u7;
+
+ for (j = 0; j < 128; j += 16) {
+ vload_s16_x2(u0, &a[j]);
+ vload_s16_x2(u1, &a[j + 128]);
+ vload_s16_x2(u2, &a[j + 256]);
+ vload_s16_x2(u3, &a[j + 384]);
+
+ vload_s16_x2(u4, &a[j + 512]);
+ vload_s16_x2(u5, &a[j + 640]);
+ vload_s16_x2(u6, &a[j + 768]);
+ vload_s16_x2(u7, &a[j + 896]);
+
+ // 2
+ barrett_x2(u0, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u1, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u2, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u3, 0, 1, 2, 3, neon_qmvq, t);
+
+ barrett_x2(u4, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u5, 0, 1, 2, 3, neon_qmvq, t2);
+ barrett_x2(u6, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u7, 0, 1, 2, 3, neon_qmvq, t2);
+
+ // u0, 4: 0.5
+ // u1, 5: 0.5
+ // u2, 6: 0.5
+ // u3, 7: 0.5
+
+ // Layer 7
+ // u0 - u1, u2 - u3
+ // u4 - u5, u6 - u7
+ gsbf_top(u0.val[0], u1.val[0], t.val[0]);
+ gsbf_top(u0.val[1], u1.val[1], t.val[1]);
+ gsbf_top(u2.val[0], u3.val[0], t.val[2]);
+ gsbf_top(u2.val[1], u3.val[1], t.val[3]);
+
+ gsbf_top(u4.val[0], u5.val[0], t2.val[0]);
+ gsbf_top(u4.val[1], u5.val[1], t2.val[1]);
+ gsbf_top(u6.val[0], u7.val[0], t2.val[2]);
+ gsbf_top(u6.val[1], u7.val[1], t2.val[3]);
+
+ gsbf_bri_bot(u1.val[0], zl.val[0], zh.val[0], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(u1.val[1], zl.val[0], zh.val[0], 0, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(u3.val[0], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(u3.val[1], zl.val[0], zh.val[0], 1, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(u5.val[0], zl.val[0], zh.val[0], 2, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(u5.val[1], zl.val[0], zh.val[0], 2, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(u7.val[0], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(u7.val[1], zl.val[0], zh.val[0], 3, neon_qmvq, t2.val[3]);
+
+ // u0, 4: 1
+ // u1, 5: .87
+ // u2, 6: 1
+ // u3, 7: .87
+
+ // Layer 8
+ // u0 - u2, u1 - u3
+ // u4 - u6, u5 - u7
+ gsbf_top(u0.val[0], u2.val[0], t.val[0]);
+ gsbf_top(u0.val[1], u2.val[1], t.val[1]);
+ gsbf_top(u1.val[0], u3.val[0], t.val[2]);
+ gsbf_top(u1.val[1], u3.val[1], t.val[3]);
+
+ gsbf_top(u4.val[0], u6.val[0], t2.val[0]);
+ gsbf_top(u4.val[1], u6.val[1], t2.val[1]);
+ gsbf_top(u5.val[0], u7.val[0], t2.val[2]);
+ gsbf_top(u5.val[1], u7.val[1], t2.val[3]);
+
+ gsbf_bri_bot(u2.val[0], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(u2.val[1], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(u3.val[0], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(u3.val[1], zl.val[0], zh.val[0], 4, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(u6.val[0], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(u6.val[1], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(u7.val[0], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(u7.val[1], zl.val[0], zh.val[0], 5, neon_qmvq, t2.val[3]);
+
+ // u0, 4: 2
+ // u2, 6: 1.25
+ // u1, 5: 1.75
+ // u3, 7: 1.15
+
+ barrett_x2(u0, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u4, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u1, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u5, 0, 1, 2, 3, neon_qmvq, t2);
+
+ // u0, 4: 0.5
+ // u2, 6: 1.25
+ // u1, 5: 0.5
+ // u3, 7: 1.15
+
+ // Layer 9
+ // u0 - u4, u1 - u5
+ // u2 - u6, u3 - u7
+ gsbf_top(u0.val[0], u4.val[0], t.val[0]);
+ gsbf_top(u0.val[1], u4.val[1], t.val[1]);
+ gsbf_top(u1.val[0], u5.val[0], t.val[2]);
+ gsbf_top(u1.val[1], u5.val[1], t.val[3]);
+
+ gsbf_top(u2.val[0], u6.val[0], t2.val[0]);
+ gsbf_top(u2.val[1], u6.val[1], t2.val[1]);
+ gsbf_top(u3.val[0], u7.val[0], t2.val[2]);
+ gsbf_top(u3.val[1], u7.val[1], t2.val[3]);
+
+ gsbf_bri_bot(u4.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(u4.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(u5.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(u5.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(u6.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(u6.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(u7.val[0], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(u7.val[1], zl.val[0], zh.val[0], 6, neon_qmvq, t2.val[3]);
+
+ // u0, 4: 1, .87
+ // u2, 6: 2.5, 1.5
+ // u1, 5: 1, .87
+ // u3, 7: 2.3, 1.4
+
+ if (ninv == INVNTT_NINV) {
+ barmul_invntt_x2(u0, zl.val[0], zh.val[0], 7, neon_qmvq, t);
+ barmul_invntt_x2(u1, zl.val[0], zh.val[0], 7, neon_qmvq, t);
+ barmul_invntt_x2(u2, zl.val[0], zh.val[0], 7, neon_qmvq, t2);
+ barmul_invntt_x2(u3, zl.val[0], zh.val[0], 7, neon_qmvq, t2);
+ }
+
+ // u0, 4: .87, .87
+ // u2, 6: 1.5, 1.5
+ // u1, 5: .87, .87
+ // u3, 7: 1.4, 1.4
+
+ barrett_x2(u2, 0, 1, 0, 1, neon_qmvq, t);
+ barrett_x2(u6, 0, 1, 2, 3, neon_qmvq, t);
+ barrett_x2(u3, 0, 1, 0, 1, neon_qmvq, t2);
+ barrett_x2(u7, 0, 1, 2, 3, neon_qmvq, t2);
+
+ // u0, 4: .87, .87
+ // u2, 6: .5, .5
+ // u1, 5: .87, .87
+ // u3, 7: .5, .5
+
+ vstore_s16_x2(&a[j], u0);
+ vstore_s16_x2(&a[j + 128], u1);
+ vstore_s16_x2(&a[j + 256], u2);
+ vstore_s16_x2(&a[j + 384], u3);
+
+ vstore_s16_x2(&a[j + 512], u4);
+ vstore_s16_x2(&a[j + 640], u5);
+ vstore_s16_x2(&a[j + 768], u6);
+ vstore_s16_x2(&a[j + 896], u7);
+ }
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
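+ // Pointwise (coefficient-wise) product f <- f * g of two polynomials in the NTT
+ // domain; montmul_x8 performs the Montgomery multiplications, 64 coefficients
+ // per loop iteration.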
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e1, e2, t, k; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+
+ for (unsigned i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+ vload_s16_x4(b, &g[i]);
+ vload_s16_x4(c, &f[i + 32]);
+ vload_s16_x4(d, &g[i + 32]);
+
+ montmul_x8(e1, e2, a, b, c, d, neon_qmvm, t, k);
+
+ vstore_s16_x4(&f[i], e1);
+ vstore_s16_x4(&f[i + 32], e2);
+ }
+}
+
+/* ===================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.c
new file mode 100644
index 000000000..f6dbf1178
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.c
@@ -0,0 +1,732 @@
+#include "ntt_consts.h"
+#include "params.h"
+
+#define PADDING 0
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_qmvq[8] = {FALCON_Q, FALCON_QINV,
+ FALCON_MONT, FALCON_NINV_MONT,
+ FALCON_V, 0,
+ FALCON_MONT_BR, FALCON_NINV_MONT_BR
+ };
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_br[] = {
+ PADDING, -1479, -5146, 4043, -1305, 722, 5736, -4134,
+ 3542, -4821, 2639, 2319, -1170, -955, -790, 1260,
+ 4388, 4632, -5755, 2426, 334, 1428, 1696, PADDING,
+ 2401, 2401, 2401, 2401, -5101, -5101, -5101, -5101,
+ 390, 390, 390, 390, -3833, -3833, -3833, -3833,
+ 354, 354, 354, 354, -2912, -2912, -2912, -2912,
+ 5012, 5012, 5012, 5012, 2859, 2859, 2859, 2859,
+ 442, 442, 442, 442, -1067, -1067, -1067, -1067,
+ 773, 773, 773, 773, 3778, 3778, 3778, 3778,
+ 4861, 4861, 4861, 4861, 5698, 5698, 5698, 5698,
+ -2481, -2481, -2481, -2481, -1045, -1045, -1045, -1045,
+ 49, 1263, 5915, 1483, -2500, -1489, -1583, -5942,
+ 1512, 350, -1815, 5383, 5369, -2057, -3202, 4493,
+ -2738, -5868, -5735, 2655, -3009, 1693, 174, 723,
+ -1975, -3757, 347, 2925, -3315, -426, 1858, 4754,
+ 7, 845, 3154, 3285, 216, -5526, 767, -2213,
+ 3120, -6086, -3941, 3536, 3229, -1706, 1282, 2021,
+ 3944, 5604, 2171, -1265, -2945, 2633, -3232, 4855,
+ -2941, -5662, 3837, 3221, 4050, 844, -980, 4590,
+ 1936, 3723, 5054, -4360, 50, 769, -3805, 4153,
+ -6105, 5646, 3753, 5370, 4730, 3929, -3572, -2832,
+ 4099, -5530, -3480, 3007, 5349, 1406, -293, -3769,
+ -567, 5289, 2595, 4273, -5207, 5202, -682, -5082,
+ -3504, -2625, -949, -3201, 3014, 5086, -1326, 2013,
+ -3289, 729, 3241, 2881, 3284, -5092, -2089, PADDING,
+ 1017, 1017, 1017, 1017, 1632, 1632, 1632, 1632,
+ 27, 27, 27, 27, -3763, -3763, -3763, -3763,
+ 1537, 1537, 1537, 1537, 4714, 4714, 4714, 4714,
+ -2678, -2678, -2678, -2678, 5019, 5019, 5019, 5019,
+ -4885, -4885, -4885, -4885, -5084, -5084, -5084, -5084,
+ -3066, -3066, -3066, -3066, -1440, -1440, -1440, -1440,
+ 242, 242, 242, 242, -4143, -4143, -4143, -4143,
+ 3704, 3704, 3704, 3704, -545, -545, -545, -545,
+ 3030, 4115, 2361, -1843, 2908, 218, 3434, -3529,
+ 3963, 576, 6142, -2447, 1954, -2051, -2882, -1805,
+ 3991, -3969, -2767, 156, 2281, 5876, -2031, 5333,
+ 3772, 418, 5908, -453, 5429, -4774, -4737, 1293,
+ -3469, -4443, 4693, -2293, 1802, 5103, -4411, 1223,
+ -1280, -24, -904, -5547, 881, 1015, 5461, 2637,
+ 4684, -5135, -4987, 3670, 578, -450, -4661, -2622,
+ 5618, 5789, 5043, 3090, 3065, -5703, -5900, -4719,
+ 6138, -3418, 2338, -417, 1555, -1891, -1590, -2334,
+ 614, -1371, -2485, -5039, -365, -1927, -2946, -4510,
+ 3360, 63, 2373, 3808, 5368, 1944, -510, -5386,
+ -1658, 3502, 826, 1398, 1506, 4483, 910, -751,
+ -2545, -563, -2975, 4846, -2747, -3135, 3712, -3694,
+ -5179, -1759, -3707, 3382, -355, -2548, -4231, PADDING,
+ 1002, 1002, 1002, 1002, 5088, 5088, 5088, 5088,
+ -4976, -4976, -4976, -4976, -3780, -3780, -3780, -3780,
+ -2437, -2437, -2437, -2437, 6022, 6022, 6022, 6022,
+ -2566, -2566, -2566, -2566, -6039, -6039, -6039, -6039,
+ 5011, 5011, 5011, 5011, -4284, -4284, -4284, -4284,
+ -1607, -1607, -1607, -1607, -875, -875, -875, -875,
+ 3646, 3646, 3646, 3646, 2987, 2987, 2987, 2987,
+ -2187, -2187, -2187, -2187, -2422, -2422, -2422, -2422,
+ 295, 6099, 5766, 652, -4016, 4077, -3762, -2919,
+ 325, -1404, -1146, -948, 5990, 1159, -3728, -4049,
+ 3329, 4298, -168, 2692, 5961, -5106, -1962, 1594,
+ -6122, -2555, -5184, -1200, 1360, 3956, -6119, 5297,
+ 4518, 1160, 2730, -2253, 2478, 4194, -1783, -4565,
+ -5170, -865, 189, -1763, -1530, -3869, 5832, -1734,
+ -5275, -1251, 2035, -1882, -4770, 5287, -5673, -5406,
+ 4834, -2828, -4113, 3840, 3451, -1241, -5781, -2643,
+ 3094, 4820, 5411, 1868, -2840, 3019, -5078, 4974,
+ 2672, 1279, 3116, 2209, 1694, -4423, 1350, -3815,
+ -1790, -5410, 1040, -6125, 944, -3669, -3020, -4665,
+ 2712, 4352, 72, -1842, -4094, 4378, -3045, 1095,
+ 3621, -3006, -2744, 4805, -3553, -1062, -2294, 3637,
+ 3459, 145, -5542, -2731, -3932, -4890, -5911, PADDING,
+ -1065, -1065, -1065, -1065, -404, -404, -404, -404,
+ 1168, 1168, 1168, 1168, -1207, -1207, -1207, -1207,
+ 493, 493, 493, 493, -5444, -5444, -5444, -5444,
+ -4337, -4337, -4337, -4337, 1378, 1378, 1378, 1378,
+ 2143, 2143, 2143, 2143, -4645, -4645, -4645, -4645,
+ 5277, 5277, 5277, 5277, 3248, 3248, 3248, 3248,
+ -4096, -4096, -4096, -4096, 2381, 2381, 2381, 2381,
+ -435, -435, -435, -435, 1912, 1912, 1912, 1912,
+ -4079, -1058, 922, 441, 1958, 4322, 1112, 2078,
+ 4046, 709, -3150, 1319, 4240, -3570, -6065, -835,
+ 2459, 683, 3656, -64, -1566, 5782, -2948, -2503,
+ -3123, -1747, -3054, -5486, -4433, -5919, 3834, -5257,
+ 2873, -791, -1120, -21, 874, 170, 2307, -648,
+ -1030, 3821, 4649, 2929, 1573, 3793, -502, 2602,
+ 1849, -3268, -4301, 457, -879, 982, 4218, -3454,
+ -4504, 530, 3578, -3466, -2046, -2957, 3317, 139,
+ 2827, 2434, -2535, -5808, -2301, -5650, 4289, -150,
+ -466, 1681, 5969, 6026, -3846, -6063, 5118, -1901,
+ 5776, 3795, -4523, -8, -2593, -2276, 4390, -3758,
+ 778, 2626, 4697, 1701, 2940, -1481, -2532, 3332,
+ -1646, 5728, -4591, 3091, -81, -4320, -1000, -2842,
+ 480, 1022, 9, -2468, 339, 5791, 544, PADDING,
+ 2166, 2166, 2166, 2166, -113, -113, -113, -113,
+ -160, -160, -160, -160, -3, -3, -3, -3,
+ 3636, 3636, 3636, 3636, 5291, 5291, 5291, 5291,
+ -1426, -1426, -1426, -1426, 1663, 1663, 1663, 1663,
+ 3915, 3915, 3915, 3915, -4919, -4919, -4919, -4919,
+ 3149, 3149, 3149, 3149, 4437, 4437, 4437, 4437,
+ 4938, 4938, 4938, 4938, 2704, 2704, 2704, 2704,
+ -4654, -4654, -4654, -4654, -1777, -1777, -1777, -1777,
+ -5241, -2920, -4169, -3127, -5468, 1010, -3482, 787,
+ 5057, 4698, 4780, -3445, -192, 1321, 4912, -2049,
+ 677, -5874, -6055, -3336, 1323, -2766, -52, 3174,
+ 1579, -431, -2505, 5906, 3957, -2839, 151, -2127,
+ 343, 4538, -5211, 1208, -1705, -416, 716, 2164,
+ 5412, -3278, 3515, 1218, -1536, 2429, 1373, 717,
+ -3368, 4238, -4222, -540, 3163, 6127, 1389, 4404,
+ 3359, 5209, 3678, -1928, 1826, 4489, 1136, 3708,
+ -3448, -1908, 1866, -4727, 2450, 814, -2110, -5416,
+ -4209, -5993, -438, 5061, -1721, -4103, -2982, -3589,
+ 4227, -612, 1526, -125, 4032, -4840, -2068, -346,
+ -3205, 1092, 4265, 464, 2926, -3171, 3449, -3238,
+ 1212, 5023, 5828, -2963, -4896, -3051, 2366, -1673,
+ 4278, -5331, -4989, -4177, -3584, 1381, -2525, PADDING,
+ 3364, 3364, 3364, 3364, 4057, 4057, 4057, 4057,
+ -2847, -2847, -2847, -2847, 2174, 2174, 2174, 2174,
+ -5042, -5042, -5042, -5042, 4053, 4053, 4053, 4053,
+ 5195, 5195, 5195, 5195, -4895, -4895, -4895, -4895,
+ 1689, 1689, 1689, 1689, -3271, -3271, -3271, -3271,
+ -4414, -4414, -4414, -4414, 4372, 4372, 4372, 4372,
+ -2305, -2305, -2305, -2305, 2645, 2645, 2645, 2645,
+ -2780, -2780, -2780, -2780, 1484, 1484, 1484, 1484,
+ -58, -241, 3532, -1003, 1956, -5009, -885, -6008,
+ 3477, -5681, 142, -1105, -2844, 3438, -975, 4212,
+ -3029, -5594, 4782, 5886, -4213, 504, 2302, -605,
+ -421, -4080, 3602, 6068, -3600, 3263, 6077, -4624,
+ 2065, 3495, -3534, -1756, 2275, 4267, 5063, -1518,
+ -1275, -1176, 4860, -1445, -5987, 579, -2769, -5966,
+ -3975, -5835, 1417, -4505, 3744, 2528, 5102, -5588,
+ 4924, 1014, 1327, 3942, 2717, 3200, 5836, 2260,
+ 5826, 4564, 3961, 4145, 2461, 5653, -4176, -3765,
+ 5508, -5734, 1125, -1131, -5596, 3889, 3114, 212,
+ 4883, 3087, 5676, 2257, 4963, -3056, -412, -5845,
+ 4781, -448, 3607, -5232, 60, -1535, -4566, 68,
+ 3195, -3328, -5777, -1177, -4255, -1635, -2768, -953,
+ -3748, 827, 5767, 2476, 118, 2197, -5067, PADDING,
+ -3247, -3247, -3247, -3247, -3978, -3978, -3978, -3978,
+ -2370, -2370, -2370, -2370, 5332, 5332, 5332, 5332,
+ 1630, 1630, 1630, 1630, 5407, 5407, 5407, 5407,
+ -1153, -1153, -1153, -1153, -2249, -2249, -2249, -2249,
+ -2686, -2686, -2686, -2686, -2969, -2969, -2969, -2969,
+ 2865, 2865, 2865, 2865, 3510, 3510, 3510, 3510,
+ -2126, -2126, -2126, -2126, 3186, 3186, 3186, 3186,
+ -2884, -2884, -2884, -2884, -4048, -4048, -4048, -4048,
+ -4467, -4789, -5537, 4749, 4449, -5456, -147, -3789,
+ 6118, -3818, 1190, -2683, 3860, 5445, -4536, -1050,
+ 5079, -3262, 2169, -522, -4324, 4916, -4075, 5315,
+ -1278, -2344, 1973, -5574, -3514, -1041, 5925, -1018,
+ 180, -4605, -1409, 204, -1468, -3407, -1344, -2483,
+ 4739, -5518, -3028, -364, -1236, -5246, 3121, 1057,
+ -406, 146, 1403, 6094, -239, 994, 4670, 5464,
+ 3375, -3393, -4913, 3825, -2947, 636, -622, 5672,
+ 4138, 2689, -5219, 5509, -3981, 463, -3042, -2054,
+ -4251, 1226, 5216, -2360, -3017, 4475, 4705, -2600,
+ -1687, 5268, 1804, -5189, -2900, 4554, -512, 4906,
+ -2291, 4335, 3528, -4235, -3982, 5609, -1737, 4499,
+ 5860, -4978, 1351, -140, -1853, -4611, -726, 3949,
+ -3296, 4452, 2396, -4354, 130, 2837, -5374, PADDING,
+ -2399, -2399, -2399, -2399, -5191, -5191, -5191, -5191,
+ -3000, -3000, -3000, -3000, 3016, 3016, 3016, 3016,
+ -5559, -5559, -5559, -5559, -2178, -2178, -2178, -2178,
+ 3985, 3985, 3985, 3985, 3531, 3531, 3531, 3531,
+ -3400, -3400, -3400, -3400, -3136, -3136, -3136, -3136,
+ 671, 671, 671, 671, 243, 243, 243, 243,
+ 420, 420, 420, 420, 1544, 1544, 1544, 1544,
+ 4905, 4905, 4905, 4905, 476, 476, 476, 476,
+ 654, 3565, 1702, 1987, -5529, 5206, 3199, -56,
+ 6136, -5862, -5415, -3643, 4948, -6137, 400, -1728,
+ 5339, 5446, 3710, 6093, 468, -3988, 316, -382,
+ -2033, -3998, 3879, 1922, -1359, -5435, 973, -1254,
+ 5598, -1892, -5724, -1029, 5959, -3959, 2442, 5115,
+ -1314, 2894, -5690, -3947, 3343, 1522, -20, 4608,
+ 4578, -375, -1836, -2185, 6085, -1038, -2231, 2800,
+ 506, 1392, 3276, 2212, -1942, 2575, 2776, -5478,
+ 3344, -3624, -1325, -1945, -2148, 5797, 1248, 4939,
+ 1744, -3654, -2455, 338, -4119, -2151, 5002, 5163,
+ 377, 1620, -425, -392, -4167, -923, -6092, 193,
+ 1255, 5784, -3338, -2674, -3408, 1165, -1178, 3511,
+}; // 1024->1416
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_qinv_br[] = {
+ PADDING, -3943, -13721, 10780, -3479, 1925, 15294, -11023,
+ 9444, -12854, 7036, 6183, -3119, -2546, -2106, 3359,
+ 11700, 12350, -15345, 6468, 890, 3807, 4522, PADDING,
+ 6402, 6402, 6402, 6402, -13601, -13601, -13601, -13601,
+ 1039, 1039, 1039, 1039, -10220, -10220, -10220, -10220,
+ 943, 943, 943, 943, -7764, -7764, -7764, -7764,
+ 13364, 13364, 13364, 13364, 7623, 7623, 7623, 7623,
+ 1178, 1178, 1178, 1178, -2845, -2845, -2845, -2845,
+ 2061, 2061, 2061, 2061, 10073, 10073, 10073, 10073,
+ 12961, 12961, 12961, 12961, 15193, 15193, 15193, 15193,
+ -6615, -6615, -6615, -6615, -2786, -2786, -2786, -2786,
+ 130, 3367, 15772, 3954, -6666, -3970, -4220, -15844,
+ 4031, 933, -4839, 14353, 14316, -5484, -8537, 11980,
+ -7300, -15646, -15292, 7079, -8023, 4514, 463, 1927,
+ -5266, -10017, 925, 7799, -8839, -1135, 4954, 12676,
+ 18, 2253, 8409, 8759, 575, -14734, 2045, -5900,
+ 8319, -16228, -10508, 9428, 8609, -4548, 3418, 5388,
+ 10516, 14942, 5788, -3373, -7852, 7020, -8617, 12945,
+ -7842, -15097, 10231, 8588, 10799, 2250, -2613, 12239,
+ 5162, 9927, 13476, -11625, 133, 2050, -10145, 11073,
+ -16278, 15054, 10007, 14318, 12612, 10476, -9524, -7551,
+ 10929, -14745, -9279, 8018, 14262, 3749, -781, -10049,
+ -1511, 14102, 6919, 11393, -13884, 13870, -1818, -13550,
+ -9343, -6999, -2530, -8535, 8036, 13561, -3535, 5367,
+ -8769, 1943, 8641, 7682, 8756, -13577, -5570, PADDING,
+ 2711, 2711, 2711, 2711, 4351, 4351, 4351, 4351,
+ 71, 71, 71, 71, -10033, -10033, -10033, -10033,
+ 4098, 4098, 4098, 4098, 12569, 12569, 12569, 12569,
+ -7140, -7140, -7140, -7140, 13382, 13382, 13382, 13382,
+ -13025, -13025, -13025, -13025, -13556, -13556, -13556, -13556,
+ -8175, -8175, -8175, -8175, -3839, -3839, -3839, -3839,
+ 645, 645, 645, 645, -11047, -11047, -11047, -11047,
+ 9876, 9876, 9876, 9876, -1453, -1453, -1453, -1453,
+ 8079, 10972, 6295, -4914, 7754, 581, 9156, -9409,
+ 10567, 1535, 16377, -6524, 5210, -5468, -7684, -4812,
+ 10641, -10583, -7378, 415, 6082, 15668, -5415, 14220,
+ 10057, 1114, 15753, -1207, 14476, -12729, -12630, 3447,
+ -9249, -11847, 12513, -6114, 4804, 13606, -11761, 3261,
+ -3413, -63, -2410, -14790, 2349, 2706, 14561, 7031,
+ 12489, -13692, -13297, 9785, 1541, -1199, -12428, -6991,
+ 14980, 15436, 13446, 8239, 8172, -15206, -15732, -12582,
+ 16366, -9113, 6234, -1111, 4146, -5042, -4239, -6223,
+ 1637, -3655, -6626, -13436, -973, -5138, -7855, -12025,
+ 8959, 167, 6327, 10153, 14313, 5183, -1359, -14361,
+ -4420, 9337, 2202, 3727, 4015, 11953, 2426, -2002,
+ -6786, -1501, -7932, 12921, -7324, -8359, 9897, -9849,
+ -13809, -4690, -9884, 9017, -946, -6794, -11281, PADDING,
+ 2671, 2671, 2671, 2671, 13566, 13566, 13566, 13566,
+ -13268, -13268, -13268, -13268, -10079, -10079, -10079, -10079,
+ -6498, -6498, -6498, -6498, 16057, 16057, 16057, 16057,
+ -6842, -6842, -6842, -6842, -16102, -16102, -16102, -16102,
+ 13361, 13361, 13361, 13361, -11423, -11423, -11423, -11423,
+ -4284, -4284, -4284, -4284, -2333, -2333, -2333, -2333,
+ 9721, 9721, 9721, 9721, 7964, 7964, 7964, 7964,
+ -5831, -5831, -5831, -5831, -6458, -6458, -6458, -6458,
+ 786, 16262, 15374, 1738, -10708, 10871, -10031, -7783,
+ 866, -3743, -3055, -2527, 15972, 3090, -9940, -10796,
+ 8876, 11460, -447, 7178, 15894, -13614, -5231, 4250,
+ -16324, -6812, -13822, -3199, 3626, 10548, -16316, 14124,
+ 12047, 3093, 7279, -6007, 6607, 11183, -4754, -12172,
+ -13785, -2306, 503, -4700, -4079, -10316, 15550, -4623,
+ -14065, -3335, 5426, -5018, -12718, 14097, -15126, -14414,
+ 12889, -7540, -10967, 10239, 9201, -3309, -15414, -7047,
+ 8249, 12852, 14428, 4980, -7572, 8050, -13540, 13262,
+ 7124, 3410, 8308, 5890, 4516, -11793, 3599, -10172,
+ -4772, -14425, 2773, -16332, 2517, -9783, -8052, -12438,
+ 7231, 11604, 191, -4911, -10916, 11673, -8119, 2919,
+ 9655, -8015, -7316, 12812, -9473, -2831, -6116, 9697,
+ 9223, 386, -14777, -7282, -10484, -13038, -15761, PADDING,
+ -2839, -2839, -2839, -2839, -1077, -1077, -1077, -1077,
+ 3114, 3114, 3114, 3114, -3218, -3218, -3218, -3218,
+ 1314, 1314, 1314, 1314, -14516, -14516, -14516, -14516,
+ -11564, -11564, -11564, -11564, 3674, 3674, 3674, 3674,
+ 5714, 5714, 5714, 5714, -12385, -12385, -12385, -12385,
+ 14070, 14070, 14070, 14070, 8660, 8660, 8660, 8660,
+ -10921, -10921, -10921, -10921, 6348, 6348, 6348, 6348,
+ -1159, -1159, -1159, -1159, 5098, 5098, 5098, 5098,
+ -10876, -2821, 2458, 1175, 5220, 11524, 2965, 5540,
+ 10788, 1890, -8399, 3517, 11305, -9519, -16172, -2226,
+ 6556, 1821, 9748, -170, -4175, 15417, -7860, -6674,
+ -8327, -4658, -8143, -14628, -11820, -15782, 10223, -14017,
+ 7660, -2109, -2986, -55, 2330, 453, 6151, -1727,
+ -2746, 10188, 12396, 7810, 4194, 10113, -1338, 6938,
+ 4930, -8713, -11468, 1218, -2343, 2618, 11247, -9209,
+ -12009, 1413, 9540, -9241, -5455, -7884, 8844, 370,
+ 7538, 6490, -6759, -15486, -6135, -15065, 11436, -399,
+ -1242, 4482, 15916, 16068, -10255, -16166, 13646, -5068,
+ 15401, 10119, -12060, -21, -6914, -6068, 11705, -10020,
+ 2074, 7002, 12524, 4535, 7839, -3949, -6751, 8884,
+ -4388, 15273, -12241, 8241, -215, -11519, -2666, -7578,
+ 1279, 2725, 23, -6580, 903, 15441, 1450, PADDING,
+ 5775, 5775, 5775, 5775, -301, -301, -301, -301,
+ -426, -426, -426, -426, -7, -7, -7, -7,
+ 9695, 9695, 9695, 9695, 14108, 14108, 14108, 14108,
+ -3802, -3802, -3802, -3802, 4434, 4434, 4434, 4434,
+ 10439, 10439, 10439, 10439, -13116, -13116, -13116, -13116,
+ 8396, 8396, 8396, 8396, 11831, 11831, 11831, 11831,
+ 13166, 13166, 13166, 13166, 7210, 7210, 7210, 7210,
+ -12409, -12409, -12409, -12409, -4738, -4738, -4738, -4738,
+ -13974, -7786, -11116, -8337, -14580, 2693, -9284, 2098,
+ 13484, 12526, 12745, -9185, -511, 3522, 13097, -5463,
+ 1805, -15662, -16145, -8895, 3527, -7375, -138, 8463,
+ 4210, -1149, -6679, 15748, 10551, -7570, 402, -5671,
+ 914, 12100, -13894, 3221, -4546, -1109, 1909, 5770,
+ 14430, -8740, 9372, 3247, -4095, 6476, 3661, 1911,
+ -8980, 11300, -11257, -1439, 8433, 16337, 3703, 11743,
+ 8956, 13889, 9807, -5140, 4868, 11969, 3029, 9887,
+ -9193, -5087, 4975, -12604, 6532, 2170, -5626, -14441,
+ -11223, -15980, -1167, 13494, -4588, -10940, -7951, -9569,
+ 11271, -1631, 4069, -333, 10751, -12905, -5514, -922,
+ -8545, 2911, 11372, 1237, 7802, -8455, 9196, -8633,
+ 3231, 13393, 15540, -7900, -13054, -8135, 6308, -4460,
+ 11407, -14214, -13302, -11137, -9556, 3682, -6732, PADDING,
+ 8969, 8969, 8969, 8969, 10817, 10817, 10817, 10817,
+ -7591, -7591, -7591, -7591, 5796, 5796, 5796, 5796,
+ -13444, -13444, -13444, -13444, 10807, 10807, 10807, 10807,
+ 13852, 13852, 13852, 13852, -13052, -13052, -13052, -13052,
+ 4503, 4503, 4503, 4503, -8721, -8721, -8721, -8721,
+ -11769, -11769, -11769, -11769, 11657, 11657, 11657, 11657,
+ -6146, -6146, -6146, -6146, 7052, 7052, 7052, 7052,
+ -7412, -7412, -7412, -7412, 3957, 3957, 3957, 3957,
+ -154, -642, 9417, -2674, 5215, -13356, -2359, -16020,
+ 9271, -15148, 378, -2946, -7583, 9167, -2599, 11231,
+ -8076, -14916, 12750, 15694, -11233, 1343, 6138, -1613,
+ -1122, -10879, 9604, 16180, -9599, 8700, 16204, -12329,
+ 5506, 9319, -9423, -4682, 6066, 11377, 13500, -4047,
+ -3399, -3135, 12958, -3853, -15964, 1543, -7383, -15908,
+ -10599, -15558, 3778, -12012, 9983, 6740, 13604, -14900,
+ 13129, 2703, 3538, 10511, 7244, 8532, 15561, 6026,
+ 15534, 12169, 10561, 11052, 6562, 15073, -11135, -10039,
+ 14686, -15289, 2999, -3015, -14921, 10369, 8303, 565,
+ 13020, 8231, 15134, 6018, 13233, -8148, -1098, -15585,
+ 12748, -1194, 9617, -13950, 159, -4093, -12175, 181,
+ 8519, -8873, -15404, -3138, -11345, -4359, -7380, -2541,
+ -9993, 2205, 15377, 6602, 314, 5858, -13510, PADDING,
+ -8657, -8657, -8657, -8657, -10607, -10607, -10607, -10607,
+ -6319, -6319, -6319, -6319, 14217, 14217, 14217, 14217,
+ 4346, 4346, 4346, 4346, 14417, 14417, 14417, 14417,
+ -3074, -3074, -3074, -3074, -5996, -5996, -5996, -5996,
+ -7162, -7162, -7162, -7162, -7916, -7916, -7916, -7916,
+ 7639, 7639, 7639, 7639, 9359, 9359, 9359, 9359,
+ -5668, -5668, -5668, -5668, 8495, 8495, 8495, 8495,
+ -7690, -7690, -7690, -7690, -10793, -10793, -10793, -10793,
+ -11911, -12769, -14764, 12662, 11863, -14548, -391, -10103,
+ 16313, -10180, 3173, -7154, 10292, 14518, -12095, -2799,
+ 13542, -8697, 5783, -1391, -11529, 13108, -10865, 14172,
+ -3407, -6250, 5260, -14862, -9369, -2775, 15798, -2714,
+ 479, -12279, -3757, 543, -3914, -9084, -3583, -6620,
+ 12636, -14713, -8074, -970, -3295, -13988, 8321, 2818,
+ -1082, 389, 3741, 16249, -637, 2650, 12452, 14569,
+ 8999, -9047, -13100, 10199, -7858, 1695, -1658, 15124,
+ 11033, 7170, -13916, 14689, -10615, 1234, -8111, -5476,
+ -11335, 3269, 13908, -6292, -8044, 11932, 12545, -6932,
+ -4498, 14046, 4810, -13836, -7732, 12143, -1365, 13081,
+ -6108, 11559, 9407, -11292, -10617, 14956, -4631, 11996,
+ 15625, -13273, 3602, -373, -4940, -12294, -1935, 10529,
+ -8788, 11871, 6388, -11609, 346, 7564, -14329, PADDING,
+ -6396, -6396, -6396, -6396, -13841, -13841, -13841, -13841,
+ -7999, -7999, -7999, -7999, 8042, 8042, 8042, 8042,
+ -14822, -14822, -14822, -14822, -5807, -5807, -5807, -5807,
+ 10625, 10625, 10625, 10625, 9415, 9415, 9415, 9415,
+ -9065, -9065, -9065, -9065, -8361, -8361, -8361, -8361,
+ 1789, 1789, 1789, 1789, 647, 647, 647, 647,
+ 1119, 1119, 1119, 1119, 4116, 4116, 4116, 4116,
+ 13078, 13078, 13078, 13078, 1269, 1269, 1269, 1269,
+ 1743, 9505, 4538, 5298, -14742, 13881, 8529, -149,
+ 16361, -15630, -14438, -9713, 13193, -16364, 1066, -4607,
+ 14236, 14521, 9892, 16246, 1247, -10633, 842, -1018,
+ -5420, -10660, 10343, 5124, -3623, -14492, 2594, -3343,
+ 14926, -5044, -15262, -2743, 15889, -10556, 6511, 13638,
+ -3503, 7716, -15172, -10524, 8913, 4058, -53, 12287,
+ 12207, -999, -4895, -5826, 16225, -2767, -5948, 7466,
+ 1349, 3711, 8735, 5898, -5178, 6866, 7402, -14606,
+ 8916, -9663, -3533, -5186, -5727, 15457, 3327, 13169,
+ 4650, -9743, -6546, 901, -10983, -5735, 13337, 13766,
+ 1005, 4319, -1133, -1045, -11111, -2461, -16244, 514,
+ 3346, 15422, -8900, -7130, -9087, 3106, -3141, 9361,
+}; // 1416
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_br[] = {
+ -3511, 1178, -1165, 3408, 2674, 3338, -5784, -1255,
+ -193, 6092, 923, 4167, 392, 425, -1620, -377,
+ -5163, -5002, 2151, 4119, -338, 2455, 3654, -1744,
+ -4939, -1248, -5797, 2148, 1945, 1325, 3624, -3344,
+ 5478, -2776, -2575, 1942, -2212, -3276, -1392, -506,
+ -2800, 2231, 1038, -6085, 2185, 1836, 375, -4578,
+ -4608, 20, -1522, -3343, 3947, 5690, -2894, 1314,
+ -5115, -2442, 3959, -5959, 1029, 5724, 1892, -5598,
+ 1254, -973, 5435, 1359, -1922, -3879, 3998, 2033,
+ 382, -316, 3988, -468, -6093, -3710, -5446, -5339,
+ 1728, -400, 6137, -4948, 3643, 5415, 5862, -6136,
+ 56, -3199, -5206, 5529, -1987, -1702, -3565, -654,
+ -476, -476, -476, -476, -4905, -4905, -4905, -4905,
+ -1544, -1544, -1544, -1544, -420, -420, -420, -420,
+ -243, -243, -243, -243, -671, -671, -671, -671,
+ 3136, 3136, 3136, 3136, 3400, 3400, 3400, 3400,
+ -3531, -3531, -3531, -3531, -3985, -3985, -3985, -3985,
+ 2178, 2178, 2178, 2178, 5559, 5559, 5559, 5559,
+ -3016, -3016, -3016, -3016, 3000, 3000, 3000, 3000,
+ 5191, 5191, 5191, 5191, 2399, 2399, 2399, 2399,
+ 5374, -2837, -130, 4354, -2396, -4452, 3296, -3949,
+ 726, 4611, 1853, 140, -1351, 4978, -5860, PADDING,
+ -4499, 1737, -5609, 3982, 4235, -3528, -4335, 2291,
+ -4906, 512, -4554, 2900, 5189, -1804, -5268, 1687,
+ 2600, -4705, -4475, 3017, 2360, -5216, -1226, 4251,
+ 2054, 3042, -463, 3981, -5509, 5219, -2689, -4138,
+ -5672, 622, -636, 2947, -3825, 4913, 3393, -3375,
+ -5464, -4670, -994, 239, -6094, -1403, -146, 406,
+ -1057, -3121, 5246, 1236, 364, 3028, 5518, -4739,
+ 2483, 1344, 3407, 1468, -204, 1409, 4605, -180,
+ 1018, -5925, 1041, 3514, 5574, -1973, 2344, 1278,
+ -5315, 4075, -4916, 4324, 522, -2169, 3262, -5079,
+ 1050, 4536, -5445, -3860, 2683, -1190, 3818, -6118,
+ 3789, 147, 5456, -4449, -4749, 5537, 4789, 4467,
+ 4048, 4048, 4048, 4048, 2884, 2884, 2884, 2884,
+ -3186, -3186, -3186, -3186, 2126, 2126, 2126, 2126,
+ -3510, -3510, -3510, -3510, -2865, -2865, -2865, -2865,
+ 2969, 2969, 2969, 2969, 2686, 2686, 2686, 2686,
+ 2249, 2249, 2249, 2249, 1153, 1153, 1153, 1153,
+ -5407, -5407, -5407, -5407, -1630, -1630, -1630, -1630,
+ -5332, -5332, -5332, -5332, 2370, 2370, 2370, 2370,
+ 3978, 3978, 3978, 3978, 3247, 3247, 3247, 3247,
+ 5067, -2197, -118, -2476, -5767, -827, 3748, 953,
+ 2768, 1635, 4255, 1177, 5777, 3328, -3195, PADDING,
+ -68, 4566, 1535, -60, 5232, -3607, 448, -4781,
+ 5845, 412, 3056, -4963, -2257, -5676, -3087, -4883,
+ -212, -3114, -3889, 5596, 1131, -1125, 5734, -5508,
+ 3765, 4176, -5653, -2461, -4145, -3961, -4564, -5826,
+ -2260, -5836, -3200, -2717, -3942, -1327, -1014, -4924,
+ 5588, -5102, -2528, -3744, 4505, -1417, 5835, 3975,
+ 5966, 2769, -579, 5987, 1445, -4860, 1176, 1275,
+ 1518, -5063, -4267, -2275, 1756, 3534, -3495, -2065,
+ 4624, -6077, -3263, 3600, -6068, -3602, 4080, 421,
+ 605, -2302, -504, 4213, -5886, -4782, 5594, 3029,
+ -4212, 975, -3438, 2844, 1105, -142, 5681, -3477,
+ 6008, 885, 5009, -1956, 1003, -3532, 241, 58,
+ -1484, -1484, -1484, -1484, 2780, 2780, 2780, 2780,
+ -2645, -2645, -2645, -2645, 2305, 2305, 2305, 2305,
+ -4372, -4372, -4372, -4372, 4414, 4414, 4414, 4414,
+ 3271, 3271, 3271, 3271, -1689, -1689, -1689, -1689,
+ 4895, 4895, 4895, 4895, -5195, -5195, -5195, -5195,
+ -4053, -4053, -4053, -4053, 5042, 5042, 5042, 5042,
+ -2174, -2174, -2174, -2174, 2847, 2847, 2847, 2847,
+ -4057, -4057, -4057, -4057, -3364, -3364, -3364, -3364,
+ 2525, -1381, 3584, 4177, 4989, 5331, -4278, 1673,
+ -2366, 3051, 4896, 2963, -5828, -5023, -1212, PADDING,
+ 3238, -3449, 3171, -2926, -464, -4265, -1092, 3205,
+ 346, 2068, 4840, -4032, 125, -1526, 612, -4227,
+ 3589, 2982, 4103, 1721, -5061, 438, 5993, 4209,
+ 5416, 2110, -814, -2450, 4727, -1866, 1908, 3448,
+ -3708, -1136, -4489, -1826, 1928, -3678, -5209, -3359,
+ -4404, -1389, -6127, -3163, 540, 4222, -4238, 3368,
+ -717, -1373, -2429, 1536, -1218, -3515, 3278, -5412,
+ -2164, -716, 416, 1705, -1208, 5211, -4538, -343,
+ 2127, -151, 2839, -3957, -5906, 2505, 431, -1579,
+ -3174, 52, 2766, -1323, 3336, 6055, 5874, -677,
+ 2049, -4912, -1321, 192, 3445, -4780, -4698, -5057,
+ -787, 3482, -1010, 5468, 3127, 4169, 2920, 5241,
+ 1777, 1777, 1777, 1777, 4654, 4654, 4654, 4654,
+ -2704, -2704, -2704, -2704, -4938, -4938, -4938, -4938,
+ -4437, -4437, -4437, -4437, -3149, -3149, -3149, -3149,
+ 4919, 4919, 4919, 4919, -3915, -3915, -3915, -3915,
+ -1663, -1663, -1663, -1663, 1426, 1426, 1426, 1426,
+ -5291, -5291, -5291, -5291, -3636, -3636, -3636, -3636,
+ 3, 3, 3, 3, 160, 160, 160, 160,
+ 113, 113, 113, 113, -2166, -2166, -2166, -2166,
+ -544, -5791, -339, 2468, -9, -1022, -480, 2842,
+ 1000, 4320, 81, -3091, 4591, -5728, 1646, PADDING,
+ -3332, 2532, 1481, -2940, -1701, -4697, -2626, -778,
+ 3758, -4390, 2276, 2593, 8, 4523, -3795, -5776,
+ 1901, -5118, 6063, 3846, -6026, -5969, -1681, 466,
+ 150, -4289, 5650, 2301, 5808, 2535, -2434, -2827,
+ -139, -3317, 2957, 2046, 3466, -3578, -530, 4504,
+ 3454, -4218, -982, 879, -457, 4301, 3268, -1849,
+ -2602, 502, -3793, -1573, -2929, -4649, -3821, 1030,
+ 648, -2307, -170, -874, 21, 1120, 791, -2873,
+ 5257, -3834, 5919, 4433, 5486, 3054, 1747, 3123,
+ 2503, 2948, -5782, 1566, 64, -3656, -683, -2459,
+ 835, 6065, 3570, -4240, -1319, 3150, -709, -4046,
+ -2078, -1112, -4322, -1958, -441, -922, 1058, 4079,
+ -1912, -1912, -1912, -1912, 435, 435, 435, 435,
+ -2381, -2381, -2381, -2381, 4096, 4096, 4096, 4096,
+ -3248, -3248, -3248, -3248, -5277, -5277, -5277, -5277,
+ 4645, 4645, 4645, 4645, -2143, -2143, -2143, -2143,
+ -1378, -1378, -1378, -1378, 4337, 4337, 4337, 4337,
+ 5444, 5444, 5444, 5444, -493, -493, -493, -493,
+ 1207, 1207, 1207, 1207, -1168, -1168, -1168, -1168,
+ 404, 404, 404, 404, 1065, 1065, 1065, 1065,
+ 5911, 4890, 3932, 2731, 5542, -145, -3459, -3637,
+ 2294, 1062, 3553, -4805, 2744, 3006, -3621, PADDING,
+ -1095, 3045, -4378, 4094, 1842, -72, -4352, -2712,
+ 4665, 3020, 3669, -944, 6125, -1040, 5410, 1790,
+ 3815, -1350, 4423, -1694, -2209, -3116, -1279, -2672,
+ -4974, 5078, -3019, 2840, -1868, -5411, -4820, -3094,
+ 2643, 5781, 1241, -3451, -3840, 4113, 2828, -4834,
+ 5406, 5673, -5287, 4770, 1882, -2035, 1251, 5275,
+ 1734, -5832, 3869, 1530, 1763, -189, 865, 5170,
+ 4565, 1783, -4194, -2478, 2253, -2730, -1160, -4518,
+ -5297, 6119, -3956, -1360, 1200, 5184, 2555, 6122,
+ -1594, 1962, 5106, -5961, -2692, 168, -4298, -3329,
+ 4049, 3728, -1159, -5990, 948, 1146, 1404, -325,
+ 2919, 3762, -4077, 4016, -652, -5766, -6099, -295,
+ 2422, 2422, 2422, 2422, 2187, 2187, 2187, 2187,
+ -2987, -2987, -2987, -2987, -3646, -3646, -3646, -3646,
+ 875, 875, 875, 875, 1607, 1607, 1607, 1607,
+ 4284, 4284, 4284, 4284, -5011, -5011, -5011, -5011,
+ 6039, 6039, 6039, 6039, 2566, 2566, 2566, 2566,
+ -6022, -6022, -6022, -6022, 2437, 2437, 2437, 2437,
+ 3780, 3780, 3780, 3780, 4976, 4976, 4976, 4976,
+ -5088, -5088, -5088, -5088, -1002, -1002, -1002, -1002,
+ 4231, 2548, 355, -3382, 3707, 1759, 5179, 3694,
+ -3712, 3135, 2747, -4846, 2975, 563, 2545, PADDING,
+ 751, -910, -4483, -1506, -1398, -826, -3502, 1658,
+ 5386, 510, -1944, -5368, -3808, -2373, -63, -3360,
+ 4510, 2946, 1927, 365, 5039, 2485, 1371, -614,
+ 2334, 1590, 1891, -1555, 417, -2338, 3418, -6138,
+ 4719, 5900, 5703, -3065, -3090, -5043, -5789, -5618,
+ 2622, 4661, 450, -578, -3670, 4987, 5135, -4684,
+ -2637, -5461, -1015, -881, 5547, 904, 24, 1280,
+ -1223, 4411, -5103, -1802, 2293, -4693, 4443, 3469,
+ -1293, 4737, 4774, -5429, 453, -5908, -418, -3772,
+ -5333, 2031, -5876, -2281, -156, 2767, 3969, -3991,
+ 1805, 2882, 2051, -1954, 2447, -6142, -576, -3963,
+ 3529, -3434, -218, -2908, 1843, -2361, -4115, -3030,
+ 545, 545, 545, 545, -3704, -3704, -3704, -3704,
+ 4143, 4143, 4143, 4143, -242, -242, -242, -242,
+ 1440, 1440, 1440, 1440, 3066, 3066, 3066, 3066,
+ 5084, 5084, 5084, 5084, 4885, 4885, 4885, 4885,
+ -5019, -5019, -5019, -5019, 2678, 2678, 2678, 2678,
+ -4714, -4714, -4714, -4714, -1537, -1537, -1537, -1537,
+ 3763, 3763, 3763, 3763, -27, -27, -27, -27,
+ -1632, -1632, -1632, -1632, -1017, -1017, -1017, -1017,
+ 2089, 5092, -3284, -2881, -3241, -729, 3289, -2013,
+ 1326, -5086, -3014, 3201, 949, 2625, 3504, PADDING,
+ 5082, 682, -5202, 5207, -4273, -2595, -5289, 567,
+ 3769, 293, -1406, -5349, -3007, 3480, 5530, -4099,
+ 2832, 3572, -3929, -4730, -5370, -3753, -5646, 6105,
+ -4153, 3805, -769, -50, 4360, -5054, -3723, -1936,
+ -4590, 980, -844, -4050, -3221, -3837, 5662, 2941,
+ -4855, 3232, -2633, 2945, 1265, -2171, -5604, -3944,
+ -2021, -1282, 1706, -3229, -3536, 3941, 6086, -3120,
+ 2213, -767, 5526, -216, -3285, -3154, -845, -7,
+ -4754, -1858, 426, 3315, -2925, -347, 3757, 1975,
+ -723, -174, -1693, 3009, -2655, 5735, 5868, 2738,
+ -4493, 3202, 2057, -5369, -5383, 1815, -350, -1512,
+ 5942, 1583, 1489, 2500, -1483, -5915, -1263, -49,
+ 1045, 1045, 1045, 1045, 2481, 2481, 2481, 2481,
+ -5698, -5698, -5698, -5698, -4861, -4861, -4861, -4861,
+ -3778, -3778, -3778, -3778, -773, -773, -773, -773,
+ 1067, 1067, 1067, 1067, -442, -442, -442, -442,
+ -2859, -2859, -2859, -2859, -5012, -5012, -5012, -5012,
+ 2912, 2912, 2912, 2912, -354, -354, -354, -354,
+ 3833, 3833, 3833, 3833, -390, -390, -390, -390,
+ 5101, 5101, 5101, 5101, -2401, -2401, -2401, -2401,
+ -1696, -1428, -334, -2426, 5755, -4632, -4388, -1260,
+ 790, 955, 1170, -2319, -2639, 4821, -3542, PADDING,
+ 4134, -5736, -722, 1305, -4043, 5146, 1479, PADDING, // dup
+ 4134, -5736, -722, 1305, -4043, 5146, 6830, 12277, // ninv=1
+}; // 1424
+
+const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_qinv_br[] = {
+ -9361, 3141, -3106, 9087, 7130, 8900, -15422, -3346,
+ -514, 16244, 2461, 11111, 1045, 1133, -4319, -1005,
+ -13766, -13337, 5735, 10983, -901, 6546, 9743, -4650,
+ -13169, -3327, -15457, 5727, 5186, 3533, 9663, -8916,
+ 14606, -7402, -6866, 5178, -5898, -8735, -3711, -1349,
+ -7466, 5948, 2767, -16225, 5826, 4895, 999, -12207,
+ -12287, 53, -4058, -8913, 10524, 15172, -7716, 3503,
+ -13638, -6511, 10556, -15889, 2743, 15262, 5044, -14926,
+ 3343, -2594, 14492, 3623, -5124, -10343, 10660, 5420,
+ 1018, -842, 10633, -1247, -16246, -9892, -14521, -14236,
+ 4607, -1066, 16364, -13193, 9713, 14438, 15630, -16361,
+ 149, -8529, -13881, 14742, -5298, -4538, -9505, -1743,
+ -1269, -1269, -1269, -1269, -13078, -13078, -13078, -13078,
+ -4116, -4116, -4116, -4116, -1119, -1119, -1119, -1119,
+ -647, -647, -647, -647, -1789, -1789, -1789, -1789,
+ 8361, 8361, 8361, 8361, 9065, 9065, 9065, 9065,
+ -9415, -9415, -9415, -9415, -10625, -10625, -10625, -10625,
+ 5807, 5807, 5807, 5807, 14822, 14822, 14822, 14822,
+ -8042, -8042, -8042, -8042, 7999, 7999, 7999, 7999,
+ 13841, 13841, 13841, 13841, 6396, 6396, 6396, 6396,
+ 14329, -7564, -346, 11609, -6388, -11871, 8788, -10529,
+ 1935, 12294, 4940, 373, -3602, 13273, -15625, PADDING,
+ -11996, 4631, -14956, 10617, 11292, -9407, -11559, 6108,
+ -13081, 1365, -12143, 7732, 13836, -4810, -14046, 4498,
+ 6932, -12545, -11932, 8044, 6292, -13908, -3269, 11335,
+ 5476, 8111, -1234, 10615, -14689, 13916, -7170, -11033,
+ -15124, 1658, -1695, 7858, -10199, 13100, 9047, -8999,
+ -14569, -12452, -2650, 637, -16249, -3741, -389, 1082,
+ -2818, -8321, 13988, 3295, 970, 8074, 14713, -12636,
+ 6620, 3583, 9084, 3914, -543, 3757, 12279, -479,
+ 2714, -15798, 2775, 9369, 14862, -5260, 6250, 3407,
+ -14172, 10865, -13108, 11529, 1391, -5783, 8697, -13542,
+ 2799, 12095, -14518, -10292, 7154, -3173, 10180, -16313,
+ 10103, 391, 14548, -11863, -12662, 14764, 12769, 11911,
+ 10793, 10793, 10793, 10793, 7690, 7690, 7690, 7690,
+ -8495, -8495, -8495, -8495, 5668, 5668, 5668, 5668,
+ -9359, -9359, -9359, -9359, -7639, -7639, -7639, -7639,
+ 7916, 7916, 7916, 7916, 7162, 7162, 7162, 7162,
+ 5996, 5996, 5996, 5996, 3074, 3074, 3074, 3074,
+ -14417, -14417, -14417, -14417, -4346, -4346, -4346, -4346,
+ -14217, -14217, -14217, -14217, 6319, 6319, 6319, 6319,
+ 10607, 10607, 10607, 10607, 8657, 8657, 8657, 8657,
+ 13510, -5858, -314, -6602, -15377, -2205, 9993, 2541,
+ 7380, 4359, 11345, 3138, 15404, 8873, -8519, PADDING,
+ -181, 12175, 4093, -159, 13950, -9617, 1194, -12748,
+ 15585, 1098, 8148, -13233, -6018, -15134, -8231, -13020,
+ -565, -8303, -10369, 14921, 3015, -2999, 15289, -14686,
+ 10039, 11135, -15073, -6562, -11052, -10561, -12169, -15534,
+ -6026, -15561, -8532, -7244, -10511, -3538, -2703, -13129,
+ 14900, -13604, -6740, -9983, 12012, -3778, 15558, 10599,
+ 15908, 7383, -1543, 15964, 3853, -12958, 3135, 3399,
+ 4047, -13500, -11377, -6066, 4682, 9423, -9319, -5506,
+ 12329, -16204, -8700, 9599, -16180, -9604, 10879, 1122,
+ 1613, -6138, -1343, 11233, -15694, -12750, 14916, 8076,
+ -11231, 2599, -9167, 7583, 2946, -378, 15148, -9271,
+ 16020, 2359, 13356, -5215, 2674, -9417, 642, 154,
+ -3957, -3957, -3957, -3957, 7412, 7412, 7412, 7412,
+ -7052, -7052, -7052, -7052, 6146, 6146, 6146, 6146,
+ -11657, -11657, -11657, -11657, 11769, 11769, 11769, 11769,
+ 8721, 8721, 8721, 8721, -4503, -4503, -4503, -4503,
+ 13052, 13052, 13052, 13052, -13852, -13852, -13852, -13852,
+ -10807, -10807, -10807, -10807, 13444, 13444, 13444, 13444,
+ -5796, -5796, -5796, -5796, 7591, 7591, 7591, 7591,
+ -10817, -10817, -10817, -10817, -8969, -8969, -8969, -8969,
+ 6732, -3682, 9556, 11137, 13302, 14214, -11407, 4460,
+ -6308, 8135, 13054, 7900, -15540, -13393, -3231, PADDING,
+ 8633, -9196, 8455, -7802, -1237, -11372, -2911, 8545,
+ 922, 5514, 12905, -10751, 333, -4069, 1631, -11271,
+ 9569, 7951, 10940, 4588, -13494, 1167, 15980, 11223,
+ 14441, 5626, -2170, -6532, 12604, -4975, 5087, 9193,
+ -9887, -3029, -11969, -4868, 5140, -9807, -13889, -8956,
+ -11743, -3703, -16337, -8433, 1439, 11257, -11300, 8980,
+ -1911, -3661, -6476, 4095, -3247, -9372, 8740, -14430,
+ -5770, -1909, 1109, 4546, -3221, 13894, -12100, -914,
+ 5671, -402, 7570, -10551, -15748, 6679, 1149, -4210,
+ -8463, 138, 7375, -3527, 8895, 16145, 15662, -1805,
+ 5463, -13097, -3522, 511, 9185, -12745, -12526, -13484,
+ -2098, 9284, -2693, 14580, 8337, 11116, 7786, 13974,
+ 4738, 4738, 4738, 4738, 12409, 12409, 12409, 12409,
+ -7210, -7210, -7210, -7210, -13166, -13166, -13166, -13166,
+ -11831, -11831, -11831, -11831, -8396, -8396, -8396, -8396,
+ 13116, 13116, 13116, 13116, -10439, -10439, -10439, -10439,
+ -4434, -4434, -4434, -4434, 3802, 3802, 3802, 3802,
+ -14108, -14108, -14108, -14108, -9695, -9695, -9695, -9695,
+ 7, 7, 7, 7, 426, 426, 426, 426,
+ 301, 301, 301, 301, -5775, -5775, -5775, -5775,
+ -1450, -15441, -903, 6580, -23, -2725, -1279, 7578,
+ 2666, 11519, 215, -8241, 12241, -15273, 4388, PADDING,
+ -8884, 6751, 3949, -7839, -4535, -12524, -7002, -2074,
+ 10020, -11705, 6068, 6914, 21, 12060, -10119, -15401,
+ 5068, -13646, 16166, 10255, -16068, -15916, -4482, 1242,
+ 399, -11436, 15065, 6135, 15486, 6759, -6490, -7538,
+ -370, -8844, 7884, 5455, 9241, -9540, -1413, 12009,
+ 9209, -11247, -2618, 2343, -1218, 11468, 8713, -4930,
+ -6938, 1338, -10113, -4194, -7810, -12396, -10188, 2746,
+ 1727, -6151, -453, -2330, 55, 2986, 2109, -7660,
+ 14017, -10223, 15782, 11820, 14628, 8143, 4658, 8327,
+ 6674, 7860, -15417, 4175, 170, -9748, -1821, -6556,
+ 2226, 16172, 9519, -11305, -3517, 8399, -1890, -10788,
+ -5540, -2965, -11524, -5220, -1175, -2458, 2821, 10876,
+ -5098, -5098, -5098, -5098, 1159, 1159, 1159, 1159,
+ -6348, -6348, -6348, -6348, 10921, 10921, 10921, 10921,
+ -8660, -8660, -8660, -8660, -14070, -14070, -14070, -14070,
+ 12385, 12385, 12385, 12385, -5714, -5714, -5714, -5714,
+ -3674, -3674, -3674, -3674, 11564, 11564, 11564, 11564,
+ 14516, 14516, 14516, 14516, -1314, -1314, -1314, -1314,
+ 3218, 3218, 3218, 3218, -3114, -3114, -3114, -3114,
+ 1077, 1077, 1077, 1077, 2839, 2839, 2839, 2839,
+ 15761, 13038, 10484, 7282, 14777, -386, -9223, -9697,
+ 6116, 2831, 9473, -12812, 7316, 8015, -9655, PADDING,
+ -2919, 8119, -11673, 10916, 4911, -191, -11604, -7231,
+ 12438, 8052, 9783, -2517, 16332, -2773, 14425, 4772,
+ 10172, -3599, 11793, -4516, -5890, -8308, -3410, -7124,
+ -13262, 13540, -8050, 7572, -4980, -14428, -12852, -8249,
+ 7047, 15414, 3309, -9201, -10239, 10967, 7540, -12889,
+ 14414, 15126, -14097, 12718, 5018, -5426, 3335, 14065,
+ 4623, -15550, 10316, 4079, 4700, -503, 2306, 13785,
+ 12172, 4754, -11183, -6607, 6007, -7279, -3093, -12047,
+ -14124, 16316, -10548, -3626, 3199, 13822, 6812, 16324,
+ -4250, 5231, 13614, -15894, -7178, 447, -11460, -8876,
+ 10796, 9940, -3090, -15972, 2527, 3055, 3743, -866,
+ 7783, 10031, -10871, 10708, -1738, -15374, -16262, -786,
+ 6458, 6458, 6458, 6458, 5831, 5831, 5831, 5831,
+ -7964, -7964, -7964, -7964, -9721, -9721, -9721, -9721,
+ 2333, 2333, 2333, 2333, 4284, 4284, 4284, 4284,
+ 11423, 11423, 11423, 11423, -13361, -13361, -13361, -13361,
+ 16102, 16102, 16102, 16102, 6842, 6842, 6842, 6842,
+ -16057, -16057, -16057, -16057, 6498, 6498, 6498, 6498,
+ 10079, 10079, 10079, 10079, 13268, 13268, 13268, 13268,
+ -13566, -13566, -13566, -13566, -2671, -2671, -2671, -2671,
+ 11281, 6794, 946, -9017, 9884, 4690, 13809, 9849,
+ -9897, 8359, 7324, -12921, 7932, 1501, 6786, PADDING,
+ 2002, -2426, -11953, -4015, -3727, -2202, -9337, 4420,
+ 14361, 1359, -5183, -14313, -10153, -6327, -167, -8959,
+ 12025, 7855, 5138, 973, 13436, 6626, 3655, -1637,
+ 6223, 4239, 5042, -4146, 1111, -6234, 9113, -16366,
+ 12582, 15732, 15206, -8172, -8239, -13446, -15436, -14980,
+ 6991, 12428, 1199, -1541, -9785, 13297, 13692, -12489,
+ -7031, -14561, -2706, -2349, 14790, 2410, 63, 3413,
+ -3261, 11761, -13606, -4804, 6114, -12513, 11847, 9249,
+ -3447, 12630, 12729, -14476, 1207, -15753, -1114, -10057,
+ -14220, 5415, -15668, -6082, -415, 7378, 10583, -10641,
+ 4812, 7684, 5468, -5210, 6524, -16377, -1535, -10567,
+ 9409, -9156, -581, -7754, 4914, -6295, -10972, -8079,
+ 1453, 1453, 1453, 1453, -9876, -9876, -9876, -9876,
+ 11047, 11047, 11047, 11047, -645, -645, -645, -645,
+ 3839, 3839, 3839, 3839, 8175, 8175, 8175, 8175,
+ 13556, 13556, 13556, 13556, 13025, 13025, 13025, 13025,
+ -13382, -13382, -13382, -13382, 7140, 7140, 7140, 7140,
+ -12569, -12569, -12569, -12569, -4098, -4098, -4098, -4098,
+ 10033, 10033, 10033, 10033, -71, -71, -71, -71,
+ -4351, -4351, -4351, -4351, -2711, -2711, -2711, -2711,
+ 5570, 13577, -8756, -7682, -8641, -1943, 8769, -5367,
+ 3535, -13561, -8036, 8535, 2530, 6999, 9343, PADDING,
+ 13550, 1818, -13870, 13884, -11393, -6919, -14102, 1511,
+ 10049, 781, -3749, -14262, -8018, 9279, 14745, -10929,
+ 7551, 9524, -10476, -12612, -14318, -10007, -15054, 16278,
+ -11073, 10145, -2050, -133, 11625, -13476, -9927, -5162,
+ -12239, 2613, -2250, -10799, -8588, -10231, 15097, 7842,
+ -12945, 8617, -7020, 7852, 3373, -5788, -14942, -10516,
+ -5388, -3418, 4548, -8609, -9428, 10508, 16228, -8319,
+ 5900, -2045, 14734, -575, -8759, -8409, -2253, -18,
+ -12676, -4954, 1135, 8839, -7799, -925, 10017, 5266,
+ -1927, -463, -4514, 8023, -7079, 15292, 15646, 7300,
+ -11980, 8537, 5484, -14316, -14353, 4839, -933, -4031,
+ 15844, 4220, 3970, 6666, -3954, -15772, -3367, -130,
+ 2786, 2786, 2786, 2786, 6615, 6615, 6615, 6615,
+ -15193, -15193, -15193, -15193, -12961, -12961, -12961, -12961,
+ -10073, -10073, -10073, -10073, -2061, -2061, -2061, -2061,
+ 2845, 2845, 2845, 2845, -1178, -1178, -1178, -1178,
+ -7623, -7623, -7623, -7623, -13364, -13364, -13364, -13364,
+ 7764, 7764, 7764, 7764, -943, -943, -943, -943,
+ 10220, 10220, 10220, 10220, -1039, -1039, -1039, -1039,
+ 13601, 13601, 13601, 13601, -6402, -6402, -6402, -6402,
+ -4522, -3807, -890, -6468, 15345, -12350, -11700, -3359,
+ 2106, 2546, 3119, -6183, -7036, 12854, -9444, PADDING,
+ 11023, -15294, -1925, 3479, -10780, 13721, 3943, PADDING, // dup
+ 11023, -15294, -1925, 3479, -10780, 13721, 18211, 32736, // ninv=1
+}; // 1424
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.h
new file mode 100644
index 000000000..f04568d7c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/ntt_consts.h
@@ -0,0 +1,23 @@
+#ifndef NTT_CONSTS
+#define NTT_CONSTS
+
+#include <stdint.h>
+
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_qmvq[8];
+
+/*
+ * Table for NTT, binary case:
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ */
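+/*
+ * Note (illustrative): q - 1 = 12288 = 2^12 * 3, so Z_q contains elements of
+ * order 2048 = 2 * FALCON_N; a primitive 2048-th root of unity such as g = 7 is
+ * exactly what the negacyclic NTT over Z_q[x]/(x^1024 + 1) requires.
+ */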
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_ntt_qinv_br[];
+
+/*
+ * Table for inverse NTT
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
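+/*
+ * Quick check (illustrative): 7 * 8778 = 61446 = 5 * 12289 + 1, so 8778 is
+ * indeed the inverse of g = 7 modulo q = 12289.
+ */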
+
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED1024_AARCH64_invntt_qinv_br[];
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/params.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/params.h
new file mode 100644
index 000000000..d494a4806
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/params.h
@@ -0,0 +1,17 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define FALCON_LOGN 10
+
+#define FALCON_N (1 << FALCON_LOGN)
+#define FALCON_Q 12289
+#define FALCON_QINV (-12287) // pow(12289, -1, pow(2, 16)) - pow(2, 16)
+#define FALCON_V 5461 // Barrett reduction
+#define FALCON_MONT 4091 // pow(2, 16, 12289)
+#define FALCON_MONT_BR 10908 // (4091 << 16)//q//2
+
+#define FALCON_NINV_MONT 64 // pow(1024, -1, 12289) * pow(2, 16, 12289)
+#define FALCON_NINV_MONT_BR 170 // (64 << 16) // q //2
+#define FALCON_LOG2_NINV_MONT 6
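+// Worked check (illustrative): 2^16 = 65536 = 5 * 12289 + 4091, hence FALCON_MONT = 4091,
+// and 2^16 / 1024 = 2^6 = 64 (mod q) gives FALCON_NINV_MONT, matching FALCON_LOG2_NINV_MONT = 6.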
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly.h
new file mode 100644
index 000000000..2d7509746
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly.h
@@ -0,0 +1,42 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "inner.h"
+#include "params.h"
+
+typedef enum ntt_domain {
+ NTT_NONE = 0,
+ NTT_MONT = 1,
+ NTT_MONT_INV = 2,
+} ntt_domain_t;
+
+typedef enum invntt_domain {
+ INVNTT_NONE = 0,
+ INVNTT_NINV = 1,
+} invntt_domain_t;
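+/*
+ * These flags select whether extra scaling is folded into the last transform
+ * layer: INVNTT_NINV makes the inverse NTT also multiply by n^-1, and
+ * NTT_MONT / NTT_MONT_INV presumably fold the Montgomery factor (or its
+ * inverse) into the forward NTT in the same fashion, avoiding a separate pass.
+ */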
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]);
+
+uint16_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high);
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_float.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_float.c
new file mode 100644
index 000000000..10a302cf1
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_float.c
@@ -0,0 +1,1459 @@
+/*
+ * Poly FFT
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ // n = 2;
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfadd(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ // n = 4
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfadd(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfadd(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfaddx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a - b
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfsub(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfsub(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfsub(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfsubx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = -a
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+
+ vfneg(neon_c.val[0], neon_a.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ c[1] = fpr_neg(a[1]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2
+ vload(neon_a.val[0], &a[2]);
+ vfneg(neon_c.val[0], neon_a.val[0]);
+ vstore(&c[2], neon_c.val[0]);
+ break;
+
+ case 3:
+ // n = 8; hn = 4
+ vloadx2(neon_a2, &a[4]);
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+ vstorex2(&c[4], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = hn; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log1(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re;
+ c[1] = c_im;
+}
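+/*
+ * The vectorized paths below compute the same complex product; Falcon's FFT
+ * layout is split-complex (real parts in the first half of the array,
+ * imaginary parts in the second), which is why the FPC_MUL* macros take
+ * separate re/im inputs.
+ */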
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log2(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_c;
+ float64x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+
+ FPC_MUL(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re;
+ neon_c.val[1] = c_im;
+
+ vstorex2(&c[0], neon_c);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log3(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ FPC_MULx2(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re.val[0];
+ neon_c.val[1] = c_re.val[1];
+ neon_c.val[2] = c_im.val[0];
+ neon_c.val[3] = c_im.val[1];
+
+ vstorex4(&c[0], neon_c);
+}
+
+/* see inner.h */
+/*
+ * c = a * b
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im; // 24
+ float64x2x4_t c_re, c_im; // 8
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log1(c, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log2(c, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_log3(c, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MULx4(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log1(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+ d_re = d[0];
+ d_im = d[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re + d_re;
+ c[1] = c_im + d_im;
+
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log2(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_d;
+ float64x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+ vloadx2(neon_d, &d[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+ d_re = neon_d.val[0];
+ d_im = neon_d.val[1];
+
+ FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re;
+ neon_d.val[1] = d_im;
+
+ vstorex2(&c[0], neon_d);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log3(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_d;
+ float64x2x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+ vloadx4(neon_d, &d[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ d_re.val[0] = neon_d.val[0];
+ d_re.val[1] = neon_d.val[1];
+ d_im.val[0] = neon_d.val[2];
+ d_im.val[1] = neon_d.val[3];
+
+ FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re.val[0];
+ neon_d.val[1] = d_re.val[1];
+ neon_d.val[2] = d_im.val[0];
+ neon_d.val[3] = d_im.val[1];
+
+ vstorex4(&c[0], neon_d);
+}
+
+/* see inner.h */
+/*
+ * c = d + a * b
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(fpr *c, const fpr *restrict d,
+ const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im, d_re, d_im; // 32
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log1(c, d, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log2(c, d, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft_add_log3(c, d, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
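+/*
+ * d = a * adj(b)
+ */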
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(fpr *d, fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&d[i], d_re);
+ vstorex4(&d[i + hn], d_im);
+ }
+}
+
+// c = d + a * adj(b)
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(fpr *c,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, c_re, c_im; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ vfdupx4(c_im, 0);
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+/*
+ * c = d + a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(fpr *c,
+ const fpr *restrict d,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, d_re; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(d_re, &d[i]);
+
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(d_re.val[0], d_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(d_re.val[1], d_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(d_re.val[2], d_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(d_re.val[3], d_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], d_re);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * scalar_x
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x,
+ unsigned logn) {
+ // assert(logn >= 3);
+ // Total SIMD registers: 9
+ const unsigned falcon_n = 1 << logn;
+ float64x2x4_t neon_a, neon_c; // 8
+ float64x2_t neon_x; // 1
+ neon_x = vdupq_n_f64(x);
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfmulx4_i(neon_c, neon_a, neon_x);
+
+ vstorex4(&c[i], neon_c);
+ }
+}
+
+/* see inner.h
+ * Unused in the implementation
+ */
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_fft(fpr *restrict c,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re, c_im, m;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmulx4(m, b_re, b_re);
+ vfmlax4(m, m, b_im, b_im);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmlax4(c_re, c_re, a_im, b_im);
+
+ vfinvx4(m, m);
+
+ vfmulx4(c_im, a_im, b_re);
+ vfmlsx4(c_im, c_im, a_re, b_im);
+
+ vfmulx4(c_re, c_re, m);
+ vfmulx4(c_im, c_im, m);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+/* see inner.h */
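+/*
+ * d = 1 / ( a*adj(a) + b*adj(b) ), computed coefficient-wise.
+ * The result is autoadjoint (purely real), so only the first hn real
+ * coefficients of d are written.
+ */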
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re;
+ float64x2x2_t x, y;
+ float64x2_t z;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ /*
+ * x_re = a[0];
+ * x_im = a[1];
+ * y_re = b[0];
+ * y_im = b[1];
+ * d[0] = 1.0/( (x_re*x_re) + (x_im*x_im) + (y_re*y_re) + (y_im*y_im) );
+ */
+ vload(a_re.val[0], &a[0]);
+ vload(b_re.val[0], &b[0]);
+ vfmul(a_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], a_re.val[0], b_re.val[0], b_re.val[0]);
+ d[0] = 1.0 / vaddvq_f64(c_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vloadx2(x, &a[0]);
+ vloadx2(y, &b[0]);
+
+ vfmul(z, x.val[0], x.val[0]);
+ vfmla(z, z, x.val[1], x.val[1]);
+ vfmla(z, z, y.val[0], y.val[0]);
+ vfmla(z, z, y.val[1], y.val[1]);
+ vfinv(z, z);
+
+ vstore(&d[0], z);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vloadx4(a_re, &a[0]);
+ vloadx4(b_re, &b[0]);
+
+ vfmul(x.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(x.val[0], x.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(x.val[0], x.val[0], a_re.val[2], a_re.val[2]);
+ vfmla(x.val[0], x.val[0], b_re.val[2], b_re.val[2]);
+ vfinv(x.val[0], x.val[0]);
+
+ vfmul(x.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(x.val[1], x.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(x.val[1], x.val[1], a_re.val[3], a_re.val[3]);
+ vfmla(x.val[1], x.val[1], b_re.val[3], b_re.val[3]);
+ vfinv(x.val[1], x.val[1]);
+
+ vstorex2(&d[0], x);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_im.val[0], b_im.val[0]);
+ vfinv(c_re.val[0], c_re.val[0]);
+
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_im.val[1], b_im.val[1]);
+ vfinv(c_re.val[1], c_re.val[1]);
+
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_re.val[2], b_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_im.val[2], b_im.val[2]);
+ vfinv(c_re.val[2], c_re.val[2]);
+
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_re.val[3], b_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_im.val[3], b_im.val[3]);
+ vfinv(c_re.val[3], c_re.val[3]);
+
+ vstorex4(&d[i], c_re);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
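+/*
+ * d = F*adj(f) + G*adj(g)
+ */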
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_add_muladj_fft(
+ fpr *restrict d, const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t F_re, F_im, G_re, G_im;
+ float64x2x4_t f_re, f_im, g_re, g_im;
+ float64x2x4_t a_re, a_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(F_re, &F[i]);
+ vloadx4(F_im, &F[i + hn]);
+ vloadx4(f_re, &f[i]);
+ vloadx4(f_im, &f[i + hn]);
+
+ FPC_MUL_CONJx4(a_re, a_im, F_re, F_im, f_re, f_im);
+
+ vloadx4(G_re, &G[i]);
+ vloadx4(g_re, &g[i]);
+
+ vloadx4(G_im, &G[i + hn]);
+ vloadx4(g_im, &g[i + hn]);
+
+ FPC_MLA_CONJx4(a_re, a_im, G_re, G_im, g_re, g_im);
+
+ vstorex4(&d[i], a_re);
+ vstorex4(&d[i + hn], a_im);
+ }
+}
+
+/* see inner.h */
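+/*
+ * c = a * b, where b is autoadjoint (purely real in FFT representation);
+ * only the first hn real coefficients of b are read.
+ */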
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, c_re, c_im;
+ float64x2x2_t a_re_im, b_re_im, c_re_im;
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ vload(a_re.val[0], &a[0]);
+ vfmuln(a_re.val[0], a_re.val[0], b[0]);
+ vstore(&c[0], a_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vload2(a_re_im, &a[0]);
+ vload(b_re_im.val[0], &b[0]);
+ vfmul_lane(c_re_im.val[0], a_re_im.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re_im.val[1], a_re_im.val[1], b_re_im.val[0], 1);
+ vstore2(&c[0], c_re_im);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vload4(a_re, &a[0]);
+ vloadx2(b_re_im, &b[0]);
+ vfmul_lane(c_re.val[0], a_re.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re.val[1], a_re.val[1], b_re_im.val[0], 1);
+ vfmul_lane(c_re.val[2], a_re.val[2], b_re_im.val[1], 0);
+ vfmul_lane(c_re.val[3], a_re.val[3], b_re_im.val[1], 1);
+ vstore4(&c[0], c_re);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmulx4(c_im, a_im, b_re);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
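+/*
+ * c = a / b, where b is autoadjoint (purely real in FFT representation);
+ * only the first hn real coefficients of b are read.
+ */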
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, binv, c_re, c_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(b_re, &b[i]);
+ vfinvx4(binv, b_re);
+
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmulx4(c_re, a_re, binv);
+ vfmulx4(c_im, a_im, binv);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log1(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&g11[0], g11_re.val[0]);
+ vstore(&g01[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log2(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&g11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&g01[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log3(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&g11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&g01[0], mu_re);
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01,
+ fpr *restrict g11, unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
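+ // LDL decomposition of the self-adjoint matrix [[g00, g01], [adj(g01), g11]]:
+ // mu = g01/g00 is written (conjugated) to g01, and the Schur complement
+ // d11 = g11 - mu*adj(g01) overwrites g11.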
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log1(g00, g01, g11);
+
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log2(g00, g01, g11);
+
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft_log3(g00, g01, g11);
+
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&g01[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&g11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&g11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&g01[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log1(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&d11[0], g11_re.val[0]);
+ vstore(&l10[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log2(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&d11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&l10[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log3(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&d11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&l10[0], mu_re);
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
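+ // Same LDL decomposition as poly_LDL_fft, but the results are written to
+ // the separate outputs d11 and l10 instead of overwriting g11 and g01.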
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log1(d11, l10, g00, g01, g11);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log2(d11, l10, g00, g01, g11);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft_log3(d11, l10, g00, g01, g11);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&l10[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&d11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&d11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&l10[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm,
+ const unsigned falcon_n) {
+ float64x2x4_t neon_t0;
+ uint16x8x4_t neon_hm;
+ uint16x8_t neon_zero;
+ uint32x4x4_t neon_hmu32[2];
+ int64x2x4_t neon_hms64[4];
+ neon_zero = vdupq_n_u16(0);
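+ // Zero-extend each unsigned 16-bit coefficient of hm to 64 bits by
+ // interleaving with zero (u16 -> u32 -> s64), then convert to double.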
+ for (unsigned u = 0; u < falcon_n; u += 32) {
+ neon_hm = vld1q_u16_x4(&hm[u]);
+ neon_hmu32[0].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[1], neon_zero);
+ neon_hmu32[0].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[1], neon_zero);
+
+ neon_hmu32[1].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[3], neon_zero);
+ neon_hmu32[1].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[3], neon_zero);
+
+ neon_hms64[0].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[1].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+
+ neon_hms64[2].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[3].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+
+ vfcvtx4(neon_t0, neon_hms64[0]);
+ vstorex4(&t0[u], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[1]);
+ vstorex4(&t0[u + 8], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[2]);
+ vstorex4(&t0[u + 16], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[3]);
+ vstorex4(&t0[u + 24], neon_t0);
+ }
+}
+
+fpr PQCLEAN_FALCONPADDED1024_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2) {
+ float64x2x4_t r1, r11, r2, r22;
+ float64x2x4_t bnorm, bnorm2;
+
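+ // Accumulate the squared norm ||rt1||^2 + ||rt2||^2 using two independent
+ // accumulator sets (bnorm, bnorm2), which are summed at the end.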
+ vfdupx4(bnorm, 0);
+ vfdupx4(bnorm2, 0);
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r1, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r1.val[0], r1.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r1.val[1], r1.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r1.val[2], r1.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r1.val[3], r1.val[3]);
+
+ vloadx4(r11, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r11.val[0], r11.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r11.val[1], r11.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r11.val[2], r11.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r11.val[3], r11.val[3]);
+ }
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r2, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r2.val[0], r2.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r2.val[1], r2.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r2.val[2], r2.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r2.val[3], r2.val[3]);
+
+ vloadx4(r22, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r22.val[0], r22.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r22.val[1], r22.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r22.val[2], r22.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r22.val[3], r22.val[3]);
+ }
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[1]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[1]);
+ vfadd(bnorm.val[2], bnorm.val[2], bnorm.val[3]);
+ vfadd(bnorm2.val[2], bnorm2.val[2], bnorm2.val[3]);
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[2]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[2]);
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm2.val[0]);
+
+ return vaddvq_f64(bnorm.val[0]);
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_int.c
new file mode 100644
index 000000000..d9a353970
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/poly_int.c
@@ -0,0 +1,501 @@
+/*
+ * poly_int.c
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+#include "macrous.h"
+#include "params.h"
+#include "poly.h"
+#include "ntt_consts.h"
+
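+/*
+ * Widen a polynomial with int8_t coefficients to int16_t coefficients,
+ * processing 128 coefficients per iteration.
+ */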
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]) {
+ // Total SIMD registers: 24 = 16 + 8
+ int16x8x4_t a, b, e, f; // 16
+ int8x16x4_t c, d; // 8
+
+ for (int i = 0; i < FALCON_N; i += 128) {
+ c = vld1q_s8_x4(&in[i]);
+
+ a.val[0] = vmovl_s8(vget_low_s8(c.val[0]));
+ a.val[2] = vmovl_s8(vget_low_s8(c.val[1]));
+ b.val[0] = vmovl_s8(vget_low_s8(c.val[2]));
+ b.val[2] = vmovl_s8(vget_low_s8(c.val[3]));
+
+ a.val[1] = vmovl_high_s8(c.val[0]);
+ a.val[3] = vmovl_high_s8(c.val[1]);
+ b.val[1] = vmovl_high_s8(c.val[2]);
+ b.val[3] = vmovl_high_s8(c.val[3]);
+
+ d = vld1q_s8_x4(&in[i + 64]);
+
+ e.val[0] = vmovl_s8(vget_low_s8(d.val[0]));
+ e.val[2] = vmovl_s8(vget_low_s8(d.val[1]));
+ f.val[0] = vmovl_s8(vget_low_s8(d.val[2]));
+ f.val[2] = vmovl_s8(vget_low_s8(d.val[3]));
+
+ e.val[1] = vmovl_high_s8(d.val[0]);
+ e.val[3] = vmovl_high_s8(d.val[1]);
+ f.val[1] = vmovl_high_s8(d.val[2]);
+ f.val[3] = vmovl_high_s8(d.val[3]);
+
+ vst1q_s16_x4(&out[i], a);
+ vst1q_s16_x4(&out[i + 32], b);
+ vst1q_s16_x4(&out[i + 64], e);
+ vst1q_s16_x4(&out[i + 96], f);
+ }
+}
+
+/*
+ * Return f[] = f[]/g[] % 12289
+ * See assembly https://godbolt.org/z/od3Ex7Mbx
+ */
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
+ // Total SIMD registers: 24 = 4 + 19 + 1
+ int16x8x4_t src, dst, t, k; // 4
+ int16x8x4_t y0, y1, y2, y3, y4, y5,
+ y6, y7, y8, y9, y10, y11, y12,
+ y13, y14, y15, y16, y17, y18; // 19
+ int16x8_t neon_qmvm; // 1
+
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ // Compute y18 = g^12287, starting from y0 = g
+ vload_s16_x4(y0, &g[i]);
+
+ // y0 is already in Montgomery domain
+
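+ // 12287 = FALCON_Q - 2, so y18 = g^(q-2) = 1/g mod q by Fermat's little
+ // theorem; the fixed chain of Montgomery multiplications below computes it.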
+ montmul_x4(y1, y0, y0, neon_qmvm, t);
+ montmul_x4(y2, y1, y0, neon_qmvm, k);
+ montmul_x4(y3, y2, y1, neon_qmvm, t);
+ montmul_x4(y4, y3, y3, neon_qmvm, k);
+ montmul_x4(y5, y4, y4, neon_qmvm, t);
+ montmul_x4(y6, y5, y5, neon_qmvm, k);
+ montmul_x4(y7, y6, y6, neon_qmvm, t);
+ montmul_x4(y8, y7, y7, neon_qmvm, k);
+ montmul_x4(y9, y8, y2, neon_qmvm, t);
+ montmul_x4(y10, y9, y8, neon_qmvm, k);
+ montmul_x4(y11, y10, y10, neon_qmvm, t);
+ montmul_x4(y12, y11, y11, neon_qmvm, k);
+ montmul_x4(y13, y12, y9, neon_qmvm, t);
+ montmul_x4(y14, y13, y13, neon_qmvm, k);
+ montmul_x4(y15, y14, y14, neon_qmvm, t);
+ montmul_x4(y16, y15, y10, neon_qmvm, k);
+ montmul_x4(y17, y16, y16, neon_qmvm, t);
+ montmul_x4(y18, y17, y0, neon_qmvm, k);
+
+ vload_s16_x4(src, &f[i]);
+
+ montmul_x4(dst, y18, src, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], dst);
+ }
+}
+
+/*
+ * f = g - s, with the result reduced via Barrett reduction modulo q
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]) {
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e, h, t; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED1024_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &g[i]);
+ vload_s16_x4(b, &s[i]);
+
+ e.val[0] = vsubq_s16(a.val[0], b.val[0]);
+ e.val[1] = vsubq_s16(a.val[1], b.val[1]);
+ e.val[2] = vsubq_s16(a.val[2], b.val[2]);
+ e.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+ vload_s16_x4(c, &g[i + 32]);
+ vload_s16_x4(d, &s[i + 32]);
+
+ h.val[0] = vsubq_s16(c.val[0], d.val[0]);
+ h.val[1] = vsubq_s16(c.val[1], d.val[1]);
+ h.val[2] = vsubq_s16(c.val[2], d.val[2]);
+ h.val[3] = vsubq_s16(c.val[3], d.val[3]);
+
+ barrett_x4(e, neon_qmvm, t);
+ barrett_x4(h, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], e);
+ vstore_s16_x4(&f[i + 32], h);
+ }
+}
+
+/*
+ * Check whether f[] contains a zero coefficient.
+ * Return:
+ * non-zero if some coefficient of f[] is zero
+ * 0 otherwise
+ */
+uint16_t PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 22 = 12 + 8 + 2
+ int16x8x4_t a, b; // 8
+ uint16x8x4_t c, d, e1; // 12
+ uint16x8x2_t e2; // 2
+
+ e2.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+
+ // Compare bitwise Equal to zero (vector)
+ // a == 0 ? 1 : 0;
+ c.val[0] = vceqzq_s16(a.val[0]);
+ c.val[1] = vceqzq_s16(a.val[1]);
+ c.val[2] = vceqzq_s16(a.val[2]);
+ c.val[3] = vceqzq_s16(a.val[3]);
+
+ vload_s16_x4(b, &f[i + 32]);
+
+ d.val[0] = vceqzq_s16(b.val[0]);
+ d.val[1] = vceqzq_s16(b.val[1]);
+ d.val[2] = vceqzq_s16(b.val[2]);
+ d.val[3] = vceqzq_s16(b.val[3]);
+
+ e1.val[0] = vorrq_u16(d.val[0], c.val[0]);
+ e1.val[1] = vorrq_u16(d.val[1], c.val[1]);
+ e1.val[2] = vorrq_u16(d.val[2], c.val[2]);
+ e1.val[3] = vorrq_u16(d.val[3], c.val[3]);
+
+ e1.val[0] = vorrq_u16(e1.val[0], e1.val[2]);
+ e1.val[1] = vorrq_u16(e1.val[1], e1.val[3]);
+
+ e2.val[0] = vorrq_u16(e1.val[0], e1.val[1]);
+
+ e2.val[1] = vorrq_u16(e2.val[1], e2.val[0]);
+ }
+
+ uint16_t ret = vmaxvq_u16(e2.val[1]);
+
+ return ret;
+}
+
+/*
+ * Branchless conditional addition of 2*FALCON_Q if a coefficient is negative.
+ * If a coefficient is then larger than FALCON_Q, FALCON_Q is subtracted from it.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 26 = 8 + 16 + 1 + 1
+ uint16x8x4_t b0, b1; // 8
+ int16x8x4_t a0, a1, c0, c1; // 16
+ int16x8_t neon_q; // 1
+ uint16x8_t neon_2q; // 1
+
+ neon_q = vdupq_n_s16(FALCON_Q);
+ neon_2q = vdupq_n_u16(FALCON_Q << 1);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a0, &f[i]);
+
+ b0.val[0] = vcltzq_s16(a0.val[0]);
+ b0.val[1] = vcltzq_s16(a0.val[1]);
+ b0.val[2] = vcltzq_s16(a0.val[2]);
+ b0.val[3] = vcltzq_s16(a0.val[3]);
+
+ vload_s16_x4(a1, &f[i + 32]);
+
+ // Conditional addition with 2*FALCON_Q
+ b1.val[0] = vcltzq_s16(a1.val[0]);
+ b1.val[1] = vcltzq_s16(a1.val[1]);
+ b1.val[2] = vcltzq_s16(a1.val[2]);
+ b1.val[3] = vcltzq_s16(a1.val[3]);
+
+ c0.val[0] = vreinterpretq_s16_u16(vandq_u16(b0.val[0], neon_2q));
+ c0.val[1] = vreinterpretq_s16_u16(vandq_u16(b0.val[1], neon_2q));
+ c0.val[2] = vreinterpretq_s16_u16(vandq_u16(b0.val[2], neon_2q));
+ c0.val[3] = vreinterpretq_s16_u16(vandq_u16(b0.val[3], neon_2q));
+
+ c1.val[0] = vreinterpretq_s16_u16(vandq_u16(b1.val[0], neon_2q));
+ c1.val[1] = vreinterpretq_s16_u16(vandq_u16(b1.val[1], neon_2q));
+ c1.val[2] = vreinterpretq_s16_u16(vandq_u16(b1.val[2], neon_2q));
+ c1.val[3] = vreinterpretq_s16_u16(vandq_u16(b1.val[3], neon_2q));
+
+ vadd_x4(a0, a0, c0);
+ vadd_x4(a1, a1, c1);
+
+ // a > Q ? 1 : 0
+ b0.val[0] = vcgtq_s16(a0.val[0], neon_q);
+ b0.val[1] = vcgtq_s16(a0.val[1], neon_q);
+ b0.val[2] = vcgtq_s16(a0.val[2], neon_q);
+ b0.val[3] = vcgtq_s16(a0.val[3], neon_q);
+
+ b1.val[0] = vcgtq_s16(a1.val[0], neon_q);
+ b1.val[1] = vcgtq_s16(a1.val[1], neon_q);
+ b1.val[2] = vcgtq_s16(a1.val[2], neon_q);
+ b1.val[3] = vcgtq_s16(a1.val[3], neon_q);
+
+ // Conditional subtraction with FALCON_Q
+
+ c0.val[0] = vandq_s16(vreinterpretq_s16_u16(b0.val[0]), neon_q);
+ c0.val[1] = vandq_s16(vreinterpretq_s16_u16(b0.val[1]), neon_q);
+ c0.val[2] = vandq_s16(vreinterpretq_s16_u16(b0.val[2]), neon_q);
+ c0.val[3] = vandq_s16(vreinterpretq_s16_u16(b0.val[3]), neon_q);
+
+ c1.val[0] = vandq_s16(vreinterpretq_s16_u16(b1.val[0]), neon_q);
+ c1.val[1] = vandq_s16(vreinterpretq_s16_u16(b1.val[1]), neon_q);
+ c1.val[2] = vandq_s16(vreinterpretq_s16_u16(b1.val[2]), neon_q);
+ c1.val[3] = vandq_s16(vreinterpretq_s16_u16(b1.val[3]), neon_q);
+
+ vsub_x4(a0, a0, c0);
+ vsub_x4(a1, a1, c1);
+
+ vstore_s16_x4(&f[i], a0);
+ vstore_s16_x4(&f[i + 32], a1);
+ }
+}
+
+/*
+ * Reduce each coefficient to the centered range via conditional subtraction/addition of Q,
+ * narrow to int8_t, and compare against the bounds min, max = -127, 127.
+ * Return 1 if any coefficient is out of bounds, 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]) {
+ // Total SIMD registers: 32
+ int16x8x4_t a, f; // 8
+ int16x8x4_t d0, d1; // 8
+ uint16x8x4_t c0, c1, x0, x1; // 16
+ uint16x8x2_t e; // 2
+ int8x16x4_t g; // 4
+ int16x8_t neon_127, neon__127, neon_q_2, neon__q_2; // 4
+ uint16x8_t neon_q; // 1
+ neon_127 = vdupq_n_s16(127);
+ neon__127 = vdupq_n_s16(-127);
+ neon_q = vdupq_n_u16(FALCON_Q);
+ neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
+ neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
+
+ e.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &t[i]);
+ vload_s16_x4(f, &t[i + 32]);
+
+ // Conditional subtraction with FALCON_Q
+ // a >= Q/2 ? 1 : 0
+ c0.val[0] = vcgeq_s16(a.val[0], neon_q_2);
+ c0.val[1] = vcgeq_s16(a.val[1], neon_q_2);
+ c0.val[2] = vcgeq_s16(a.val[2], neon_q_2);
+ c0.val[3] = vcgeq_s16(a.val[3], neon_q_2);
+
+ c1.val[0] = vcgeq_s16(f.val[0], neon_q_2);
+ c1.val[1] = vcgeq_s16(f.val[1], neon_q_2);
+ c1.val[2] = vcgeq_s16(f.val[2], neon_q_2);
+ c1.val[3] = vcgeq_s16(f.val[3], neon_q_2);
+
+ // Perform subtraction with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vsub_x4(a, a, d0);
+ vsub_x4(f, f, d1);
+
+ // -Q/2 > a ? 1: 0
+ c0.val[0] = vcgtq_s16(neon__q_2, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__q_2, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__q_2, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__q_2, a.val[3]);
+
+ c1.val[0] = vcgtq_s16(neon__q_2, f.val[0]);
+ c1.val[1] = vcgtq_s16(neon__q_2, f.val[1]);
+ c1.val[2] = vcgtq_s16(neon__q_2, f.val[2]);
+ c1.val[3] = vcgtq_s16(neon__q_2, f.val[3]);
+
+ // Perform addition with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vadd_x4(a, a, d0);
+ vadd_x4(f, f, d1);
+
+ g.val[0] = vmovn_high_s16(vmovn_s16(a.val[0]), a.val[1]);
+ g.val[1] = vmovn_high_s16(vmovn_s16(a.val[2]), a.val[3]);
+ g.val[2] = vmovn_high_s16(vmovn_s16(f.val[0]), f.val[1]);
+ g.val[3] = vmovn_high_s16(vmovn_s16(f.val[2]), f.val[3]);
+
+ vst1q_s8_x4(&G[i], g);
+
+ // -127 > a ? 1 : 0
+ c0.val[0] = vcgtq_s16(neon__127, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__127, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__127, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__127, a.val[3]);
+ // a > 127 ? 1 : 0
+ c1.val[0] = vcgtq_s16(a.val[0], neon_127);
+ c1.val[1] = vcgtq_s16(a.val[1], neon_127);
+ c1.val[2] = vcgtq_s16(a.val[2], neon_127);
+ c1.val[3] = vcgtq_s16(a.val[3], neon_127);
+
+ // -127 > f ? 1 : 0
+ x0.val[0] = vcgtq_s16(neon__127, f.val[0]);
+ x0.val[1] = vcgtq_s16(neon__127, f.val[1]);
+ x0.val[2] = vcgtq_s16(neon__127, f.val[2]);
+ x0.val[3] = vcgtq_s16(neon__127, f.val[3]);
+ // f > 127 ? 1 : 0
+ x1.val[0] = vcgtq_s16(f.val[0], neon_127);
+ x1.val[1] = vcgtq_s16(f.val[1], neon_127);
+ x1.val[2] = vcgtq_s16(f.val[2], neon_127);
+ x1.val[3] = vcgtq_s16(f.val[3], neon_127);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c1.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], c1.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], c1.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], c1.val[3]);
+
+ x0.val[0] = vorrq_u16(x0.val[0], x1.val[0]);
+ x0.val[1] = vorrq_u16(x0.val[1], x1.val[1]);
+ x0.val[2] = vorrq_u16(x0.val[2], x1.val[2]);
+ x0.val[3] = vorrq_u16(x0.val[3], x1.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], x0.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], x0.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], x0.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], x0.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c0.val[2]);
+ c0.val[1] = vorrq_u16(c0.val[1], c0.val[3]);
+
+ e.val[0] = vorrq_u16(c0.val[0], c0.val[1]);
+
+ e.val[1] = vorrq_u16(e.val[1], e.val[0]);
+ }
+ if (vmaxvq_u16(e.val[1])) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Check if (t < low || t > high)
+ * Return 1 if true, 0 otherwise
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high) {
+ // Total SIMD registers: 15
+ int8x16x4_t a; // 4
+ uint8x16x4_t c, d; // 8
+ uint8x16_t e; // 1
+ int8x16_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s8(high);
+ neon_low = vdupq_n_s8(low);
+ e = vdupq_n_u8(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s8(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s8(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s8(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s8(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s8(a.val[0], neon_high);
+ d.val[1] = vcgtq_s8(a.val[1], neon_high);
+ d.val[2] = vcgtq_s8(a.val[2], neon_high);
+ d.val[3] = vcgtq_s8(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u8(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u8(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u8(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u8(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u8(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[1]);
+
+ e = vorrq_u8(e, c.val[0]);
+
+ if (vmaxvq_u8(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Check if (t < low || t > high)
+ * Return 1 if true, 0 otherwise
+ * Works for FALCON_N >= 32, i.e. FALCON_LOGN >= 5
+ */
+int PQCLEAN_FALCONPADDED1024_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high) {
+ // Total SIMD registers = 15
+ int16x8x4_t a; // 4
+ uint16x8x4_t c, d; // 8
+ uint16x8_t e; // 1
+ int16x8_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s16(high);
+ neon_low = vdupq_n_s16(low);
+ e = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ a = vld1q_s16_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s16(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s16(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s16(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s16(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s16(a.val[0], neon_high);
+ d.val[1] = vcgtq_s16(a.val[1], neon_high);
+ d.val[2] = vcgtq_s16(a.val[2], neon_high);
+ d.val[3] = vcgtq_s16(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u16(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u16(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u16(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u16(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u16(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[1]);
+
+ e = vorrq_u16(e, c.val[0]);
+
+ if (vmaxvq_u16(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/pqclean.c
new file mode 100644
index 000000000..8cc756323
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/pqclean.c
@@ -0,0 +1,377 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES bytes
+ *
+ * message + signature:
+ * signature PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES bytes
+ * message
+ */
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[28 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N];
+ uint16_t h[FALCON_N];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED1024_AARCH64_keygen(&rng, f, g, F, NULL, h, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + FALCON_LOGN;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ f, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ g, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ F, PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + FALCON_LOGN;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1,
+ h, FALCON_LOGN);
+ if (v != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N], G[FALCON_N];
+ struct {
+ int16_t sig[FALCON_N];
+ uint16_t hm[FALCON_N];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + FALCON_LOGN) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(
+ f, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(
+ g, PQCLEAN_FALCONPADDED1024_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_trim_i8_decode(
+ F, PQCLEAN_FALCONPADDED1024_AARCH64_max_FG_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_AARCH64_complete_private(G, f, g, F, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(&sc, r.hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_comp_encode(sigbuf, sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int16_t h[FALCON_N];
+ int16_t hm[FALCON_N];
+ int16_t sig[FALCON_N];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + FALCON_LOGN) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_modq_decode( (uint16_t *) h,
+ pk + 1, PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1, FALCON_LOGN)
+ != PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ // The conversion of `h` to the NTT domain is done inside verify_raw()
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED1024_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
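+ // Padded format: the compressed signature value may be followed by zero
+ // padding up to the fixed length; any non-zero trailing byte is rejected.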
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AARCH64_hash_to_point_ct(&sc, (uint16_t *) hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw(hm, sig, h, (int16_t *) tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + FALCON_LOGN;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + FALCON_LOGN;
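+ /*
+ * The increment below accounts for the header byte, so that
+ * *smlen = mlen + CRYPTO_BYTES.
+ */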
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED1024_AARCH64_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/rng.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/rng.c
new file mode 100644
index 000000000..33ed43d88
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/rng.c
@@ -0,0 +1,194 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "inner.h"
+
+int PQCLEAN_FALCONPADDED1024_AARCH64_get_seed(void *seed, size_t len) {
+ unsigned char tmp[48];
+ for (size_t i = 0; i < len; i++) {
+ tmp[i] = (unsigned char) i;
+ }
+ memcpy(seed, tmp, len);
+ return 1;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ *(uint32_t *)(p->state.d + (i << 2)) = w;
+ }
+ tl = *(uint32_t *)(p->state.d + 48);
+ th = *(uint32_t *)(p->state.d + 52);
+ *(uint64_t *)(p->state.d + 48) = tl + (th << 32);
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists in key (32 bytes) then IV (16 bytes) and block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
+ */
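+ /*
+ * Concretely: the output buffer holds 8 blocks of 64 bytes
+ * (512 bytes); byte k of word v of block u is written at
+ * offset (u << 2) + (v << 5) + k, i.e. 4*u + 32*v + k.
+ */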
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sampler.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sampler.c
new file mode 100644
index 000000000..1b2e4cde9
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sampler.c
@@ -0,0 +1,292 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include <arm_neon.h>
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+
+ uint32x4x3_t w;
+ uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
+ uint32x2x3_t wh;
+ uint32x2_t cc0h, cc1h, cc2h, zzh;
+ x0 = vdupq_n_u32(v0);
+ x1 = vdupq_n_u32(v1);
+ x2 = vdupq_n_u32(v2);
+
+ // 0: 0, 3, 6, 9
+ // 1: 1, 4, 7, 10
+ // 2: 2, 5, 8, 11
+ // v0 - w0
+ // v1 - w1
+ // v2 - w2
+ // cc1 - cc0 >> 31
+ // cc2 - cc1 >> 31
+ // z + cc2 >> 31
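+ // Each table entry is three 24-bit limbs (dist[3i] being the most
+ // significant); vld3q_u32 de-interleaves four entries at a time.
+ // The vsraq_n_s32 steps propagate the borrow of v - entry from the
+ // low limbs to the high limb, so the sign bit of cc2 is 1 exactly
+ // when the 72-bit random value is below that entry; those sign
+ // bits are accumulated into zz/zzh and summed into z.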
+ w = vld3q_u32(&dist[0]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vshrq_n_u32(cc2, 31);
+
+ w = vld3q_u32(&dist[12]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[24]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[36]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ // 0: 48, 51
+ // 1: 49, 52
+ // 2: 50, 53
+ wh = vld3_u32(&dist[48]);
+ cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
+ cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
+ cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
+ cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
+ cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
+ zzh = vshr_n_u32(cc2h, 31);
+
+ z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
+ return z;
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
+ * PRNG output to limit its consumption, the sign of the difference
+ * yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED1024_AARCH64_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
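+ /* b = 1 gives z = 1 + z0 (so z >= 1); b = 0 gives z = -z0 (so z <= 0). */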
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sign.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sign.c
new file mode 100644
index 000000000..48e0d8dee
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/sign.c
@@ -0,0 +1,951 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+#include "util.h"
+#include <string.h>
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
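+ /* This recurrence solves to s(logn) = (logn + 1) * 2^logn. */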
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *restrict tree,
+ fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *restrict tree, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11,
+ unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+
+ ffLDL_fft_inner(tree + n, d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), d00, d00 + hn, logn - 1, tmp);
+
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma_10);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
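+/*
+ * Layout note: the expanded private key therefore holds 4*2^logn fpr
+ * values for the B0 matrix followed by (logn+1)*2^logn values for the
+ * ffLDL tree, i.e. (logn+5)*2^logn fpr values in total (15*1024 for
+ * logn = 10).
+ */
+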
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp) {
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rg = b00;
+ rf = b01;
+ rG = b10;
+ rF = b11;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rg, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rg, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rf, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rf, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(rf, rf, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rG, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rG, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(rF, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(rF, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(rF, rF, FALCON_LOGN);
+
+ /*
+ * The FFT of the key elements was computed above, with f and F negated.
+ */
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + FALCON_N;
+ g11 = g01 + FALCON_N;
+ gxx = g11 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(g00, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(g00, g00, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(g01, b00, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(g01, g01, b01, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(g11, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(g11, g11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, FALCON_LOGN, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, FALCON_LOGN, FALCON_LOGN);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *restrict t0, fpr *restrict t1,
+ fpr *restrict g00, fpr *restrict g01, fpr *restrict g11,
+ unsigned orig_logn, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma_10);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(z1, t1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t0, t0, tmp, z1, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *restrict z0, fpr *restrict z1,
+ const fpr *restrict tree,
+ const fpr *restrict t0, const fpr *restrict t1, unsigned logn,
+ fpr *restrict tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ // Split
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ // Split
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * General recursive case (logn >= 2).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub(tmp, t1, z1, logn);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(tmp, t0, tmp, tree, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ t0 = tmp;
+ t1 = t0 + FALCON_N;
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
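+ /*
+ * Since det(B0) = f*G - g*F = q, we have
+ * B0^-1 = (1/q) * [[b11, -b01], [-b10, b00]],
+ * so the target computed above is
+ * (t0, t1) = (hm, 0) * B0^-1 = (hm*b11/q, -hm*b01/q)
+ * in FFT representation.
+ */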
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, FALCON_LOGN, ty + FALCON_N);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t0, tx, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t0, t0, ty, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t0, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t1, tx, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t1, t1, ty, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * Compute the signature.
+ */
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ *
+ * Memory layout:
+ * b00 | b01 | b10 | b11 | t0 | t1
+ * g00 | g01 | g11 | b01 | t0 | t1
+ */
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_fft(t1, b00, b10, FALCON_LOGN); // t1 <- b00*adj(b10)
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(t0, b01, FALCON_LOGN); // t0 <- b01*adj(b01)
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(b00, b00, FALCON_LOGN); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_add(b00, b00, t0, FALCON_LOGN); // b00 <- g00
+
+ memcpy(t0, b01, FALCON_N * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_muladj_add_fft(b01, t1, b01, b11, FALCON_LOGN); // b01 <- b01*adj(b11)
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_fft(b10, b10, FALCON_LOGN); // b10 <- b10*adj(b10)
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulselfadj_add_fft(b10, b10, b11, FALCON_LOGN); // b10 = g11 <- b10*adj(b10) + b11*adj(b11)
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix uses the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = g11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ * t1, g00
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, FALCON_LOGN, FALCON_LOGN, t1 + FALCON_N);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ memmove(b11 + FALCON_N, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(tx, t0, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_fft(ty, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t0, tx, t1, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_mul_add_fft(t1, ty, t1, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t0, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED1024_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min_10;
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig, expanded_key, hm, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min_10;
+ PQCLEAN_FALCONPADDED1024_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig, f, g, F, G, hm, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.c
new file mode 100644
index 000000000..92300bb57
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.c
@@ -0,0 +1,71 @@
+/*
+ * Utils function
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "util.h"
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+void PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, const unsigned logn) {
+ float64x2x4_t neon_flo64, neon_fhi64;
+ int64x2x4_t neon_lo64, neon_hi64;
+ int32x4_t neon_lo32[2], neon_hi32[2];
+ int16x8_t neon_lo16, neon_hi16;
+ int8x16_t neon_8;
+
+ const unsigned falcon_n = 1 << logn;
+
+ for (unsigned i = 0; i < falcon_n; i += 16) {
+ neon_8 = vld1q_s8(&t[i]);
+
+ // Extend from 8 to 16 bit
+ // x7 | x6 | x5 | x5 - x3 | x2 | x1 | x0
+ neon_lo16 = vmovl_s8(vget_low_s8(neon_8));
+ neon_hi16 = vmovl_high_s8(neon_8);
+
+ // Extend from 16 to 32 bit
+ // xxx3 | xxx2 | xxx1 | xxx0
+ neon_lo32[0] = vmovl_s16(vget_low_s16(neon_lo16));
+ neon_lo32[1] = vmovl_high_s16(neon_lo16);
+ neon_hi32[0] = vmovl_s16(vget_low_s16(neon_hi16));
+ neon_hi32[1] = vmovl_high_s16(neon_hi16);
+
+ // Extend from 32 to 64 bit
+ neon_lo64.val[0] = vmovl_s32(vget_low_s32(neon_lo32[0]));
+ neon_lo64.val[1] = vmovl_high_s32(neon_lo32[0]);
+ neon_lo64.val[2] = vmovl_s32(vget_low_s32(neon_lo32[1]));
+ neon_lo64.val[3] = vmovl_high_s32(neon_lo32[1]);
+
+ neon_hi64.val[0] = vmovl_s32(vget_low_s32(neon_hi32[0]));
+ neon_hi64.val[1] = vmovl_high_s32(neon_hi32[0]);
+ neon_hi64.val[2] = vmovl_s32(vget_low_s32(neon_hi32[1]));
+ neon_hi64.val[3] = vmovl_high_s32(neon_hi32[1]);
+
+ vfcvtx4(neon_flo64, neon_lo64);
+ vfcvtx4(neon_fhi64, neon_hi64);
+
+ vstorex4(&r[i], neon_flo64);
+ vstorex4(&r[i + 8], neon_fhi64);
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.h b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.h
new file mode 100644
index 000000000..78bd83343
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/util.h
@@ -0,0 +1,8 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#define poly_small_to_fp PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/vrfy.c
new file mode 100644
index 000000000..0aa6015da
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_aarch64/vrfy.c
@@ -0,0 +1,174 @@
+/*
+ * Falcon signature verification.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt(int16_t *h) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+}
+
+void PQCLEAN_FALCONPADDED1024_AARCH64_to_ntt_monty(int16_t *h) {
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_MONT);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ /*
+ * Compute s1 = c0 - s2*h mod phi mod q (in tt[]).
+ */
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT_INV);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(tt, h);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(tt, INVNTT_NONE);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(tt, c0, tt);
+
+ /*
+ * Signature is valid if and only if the aggregate (s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED1024_AARCH64_is_short(tt, s2);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_compute_public(int16_t *h, const int8_t *f, const int8_t *g, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(h, g);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(tt, f);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(tt)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_convert_to_unsigned(h);
+
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp) {
+ int16_t *t1, *t2;
+
+ t1 = (int16_t *)tmp;
+ t2 = t1 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(t1, g);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(t1, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(t2, F);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_montmul_ntt(t1, t2);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_int8_to_int16(t2, f);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(t2)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(t1, t2);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(t1, INVNTT_NINV);
+
+ if (PQCLEAN_FALCONPADDED1024_AARCH64_poly_int16_to_int8(G, t1)) {
+ return 0;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ r = PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(tt);
+
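+ /*
+ * The high bit of r is set when some NTT coefficient of s2 is
+ * zero, so the expression below is 1 exactly when s2 is
+ * invertible mod phi mod q.
+ */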
+ return (int)(1u - (r >> 15));
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_sub_barrett(h, c0, s1);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(h, NTT_NONE);
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(tt, NTT_MONT);
+ r = PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(tt);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = (uint16_t) (~r & (uint16_t) - PQCLEAN_FALCONPADDED1024_AARCH64_is_short(s1, s2));
+ return (int)(r >> 15);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED1024_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp) {
+ int16_t *s2 = (int16_t *)tmp;
+
+ memcpy(s2, sig, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED1024_AARCH64_poly_ntt(s2, NTT_MONT);
+
+ int r = PQCLEAN_FALCONPADDED1024_AARCH64_poly_compare_with_zero(s2);
+
+ return r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/LICENSE b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/api.h b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/api.h
new file mode 100644
index 000000000..da6103260
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED1024_AVX2_API_H
+#define PQCLEAN_FALCONPADDED1024_AVX2_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES 2305
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES 1793
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES 1280
+
+#define PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_ALGNAME "Falcon-padded-1024"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
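+
+/*
+ * Illustrative usage sketch (not part of the upstream PQClean header):
+ * a minimal sign/verify round trip with the functions declared above.
+ * It assumes the program is linked against the PQClean/liboqs common
+ * code that provides randombytes() and SHAKE, which key generation and
+ * signing rely on.
+ *
+ *   #include <stdio.h>
+ *   #include "api.h"
+ *
+ *   int main(void) {
+ *       uint8_t pk[PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES];
+ *       uint8_t sk[PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES];
+ *       uint8_t sig[PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES];
+ *       const uint8_t msg[] = "example message";
+ *       size_t siglen;
+ *
+ *       if (PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(pk, sk) != 0) {
+ *           return 1;
+ *       }
+ *       if (PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(
+ *               sig, &siglen, msg, sizeof msg - 1, sk) != 0) {
+ *           return 1;
+ *       }
+ *       if (PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(
+ *               sig, siglen, msg, sizeof msg - 1, pk) != 0) {
+ *           return 1;
+ *       }
+ *       printf("verified a %zu-byte signature\n", siglen);
+ *       return 0;
+ *   }
+ */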
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/codec.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/codec.c
new file mode 100644
index 000000000..84466aa71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * will fit in the accumulator, which is a uint32_t.
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
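+
+/*
+ * Worked example (illustrative, not part of the upstream sources):
+ * encoding the coefficient -1000 with the scheme above.
+ *
+ *   sign bit:        1                  (the value is negative)
+ *   absolute value:  1000 = 0b1111101000
+ *   low 7 bits:      1000 & 127 = 104   (0b1101000)
+ *   high part:       1000 >> 7  = 7     -> seven 0 bits, then a 1
+ *
+ * The coefficient thus costs 8 + 7 + 1 = 16 bits. Small coefficients
+ * compress well: any coefficient with absolute value at most 127 costs
+ * only 9 bits.
+ */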
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_AVX2_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for the private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
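+
+/*
+ * Illustrative size check (not part of the upstream sources): for
+ * logn = 10 (n = 1024), f and g use 5 bits per coefficient and F uses
+ * 8 bits per coefficient, so the private key encoding is:
+ *
+ *   1 header byte
+ *   + 1024 * 5 / 8 = 640 bytes for f
+ *   + 1024 * 5 / 8 = 640 bytes for g
+ *   + 1024 * 8 / 8 = 1024 bytes for F
+ *   = 2305 bytes
+ *
+ * which matches CRYPTO_SECRETKEYBYTES in api.h (G is not stored; it is
+ * recomputed from f, g and F). The public key uses 14 bits per
+ * coefficient (see modq_encode above): 1 + 1024 * 14 / 8 = 1793 bytes.
+ */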
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/common.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/common.c
new file mode 100644
index 000000000..affe907eb
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce the values modulo q; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
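+
+/*
+ * Worked example (illustrative, not part of the upstream sources) of
+ * the branchless reduction used above: each step of the form
+ * x - (c & (((x - c) >> 31) - 1)) subtracts c exactly when x >= c.
+ *
+ *   w = 30000:  30000 - 24578 = 5422; then 5422 < 24578 and
+ *               5422 < 12289, so wr = 5422 = 30000 mod 12289, and
+ *               since w < 61445 the sample stays valid.
+ *   w = 61445:  ((w - 61445) >> 31) - 1 = 0xFFFFFFFF, so wr is forced
+ *               to all-ones (0xFFFF once truncated to 16 bits) and the
+ *               sample is treated as invalid by the squeezing passes.
+ */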
+
+/*
+ * The acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
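+
+/*
+ * Illustrative note (not part of the upstream sources): ng collects the
+ * OR of every partial sum, so its top bit records whether any partial
+ * sum ever reached 2^31 (each step adds at most 2^30, so an overflow
+ * cannot skip over bit 31). The final "s |= -(ng >> 31)" then maps any
+ * overflowed computation to 0xFFFFFFFF, which exceeds every entry of
+ * l2bound[], so an overflow can never be mistaken for a short vector.
+ */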
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fft.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fft.c
new file mode 100644
index 000000000..2b8ca7b49
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fft.c
@@ -0,0 +1,1108 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
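+
+/*
+ * Illustrative note (not part of the upstream sources): FPC_DIV is the
+ * usual complex division identity
+ *
+ *   a / b = a * conj(b) / |b|^2
+ *         = (a_re + i*a_im) * (b_re - i*b_im) / (b_re^2 + b_im^2)
+ *
+ * and FPC_INV is the special case a = 1.
+ */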
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
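+
+/*
+ * Small layout example (illustrative, not part of the upstream
+ * sources): for logn = 2 (N = 4), rev() works on 2 bits, so rev(0) = 0
+ * and rev(1) = 2, and the stored values are:
+ *
+ *   slot 0: Re(f(w_0))   slot 1: Re(f(w_1))
+ *   slot 2: Im(f(w_0))   slot 3: Im(f(w_1))
+ *
+ * i.e. the N/2 real parts occupy the first half of the array and the
+ * matching imaginary parts occupy the second half.
+ */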
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ if (ht >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ z_re = _mm256_loadu_pd(&f[j + ht].v);
+ z_im = _mm256_loadu_pd(&f[j + ht + hn].v);
+ y_re = FMSUB(z_re, s_re,
+ _mm256_mul_pd(z_im, s_im));
+ y_im = FMADD(z_re, s_im,
+ _mm256_mul_pd(z_im, s_re));
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ _mm256_storeu_pd(&f[j + ht].v,
+ _mm256_sub_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + ht + hn].v,
+ _mm256_sub_pd(x_im, y_im));
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im,
+ y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+ * simple doubling of all the values in the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ if (t >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ y_re = _mm256_loadu_pd(&f[j + t].v);
+ y_im = _mm256_loadu_pd(&f[j + t + hn].v);
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ x_re = _mm256_sub_pd(y_re, x_re);
+ x_im = _mm256_sub_pd(x_im, y_im);
+ z_re = FMSUB(x_im, s_im,
+ _mm256_mul_pd(x_re, s_re));
+ z_im = FMADD(x_re, s_im,
+ _mm256_mul_pd(x_im, s_re));
+ _mm256_storeu_pd(&f[j + t].v, z_re);
+ _mm256_storeu_pd(&f[j + t + hn].v, z_im);
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im,
+ x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_add_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_sub_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 8) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = (n >> 1); u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMSUB(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMADD(
+ a_re, b_im, _mm256_mul_pd(a_im, b_re));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d zero;
+
+ zero = _mm256_setzero_pd();
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v,
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)));
+ _mm256_storeu_pd(&a[u + hn].v, zero);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d x4;
+
+ x4 = _mm256_set1_pd(x.v);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im, t;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ t = _mm256_div_pd(one,
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im)));
+ b_re = _mm256_mul_pd(b_re, t);
+ b_im = _mm256_mul_pd(b_im, t);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, dv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ dv = _mm256_div_pd(one,
+ _mm256_add_pd(
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)),
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im))));
+ _mm256_storeu_pd(&d[u].v, dv);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d F_re, F_im, G_re, G_im;
+ __m256d f_re, f_im, g_re, g_im;
+ __m256d a_re, a_im, b_re, b_im;
+
+ F_re = _mm256_loadu_pd(&F[u].v);
+ F_im = _mm256_loadu_pd(&F[u + hn].v);
+ G_re = _mm256_loadu_pd(&G[u].v);
+ G_im = _mm256_loadu_pd(&G[u + hn].v);
+ f_re = _mm256_loadu_pd(&f[u].v);
+ f_im = _mm256_loadu_pd(&f[u + hn].v);
+ g_re = _mm256_loadu_pd(&g[u].v);
+ g_im = _mm256_loadu_pd(&g[u + hn].v);
+
+ a_re = FMADD(F_re, f_re,
+ _mm256_mul_pd(F_im, f_im));
+ a_im = FMSUB(F_im, f_re,
+ _mm256_mul_pd(F_re, f_im));
+ b_re = FMADD(G_re, g_re,
+ _mm256_mul_pd(G_im, g_im));
+ b_im = FMSUB(G_im, g_re,
+ _mm256_mul_pd(G_re, g_im));
+ _mm256_storeu_pd(&d[u].v,
+ _mm256_add_pd(a_re, b_re));
+ _mm256_storeu_pd(&d[u + hn].v,
+ _mm256_add_pd(a_im, b_im));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, bv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ bv = _mm256_loadu_pd(&b[u].v);
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(a_re, bv));
+ _mm256_storeu_pd(&a[u + hn].v,
+ _mm256_mul_pd(a_im, bv));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d ib, a_re, a_im;
+
+ ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
+ _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&g11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&g11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&g01[u].v, mu_re);
+ _mm256_storeu_pd(&g01[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&d11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&d11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&l10[u].v, mu_re);
+ _mm256_storeu_pd(&l10[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * bit-reversal function over the ring degree). This changes
+ * indexes with regard to the Falcon specification.
+ */
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 8) {
+ __m256d half, sv;
+
+ half = _mm256_set1_pd(0.5);
+ sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
+ for (u = 0; u < qn; u += 2) {
+ __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;
+
+ ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
+ ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
+ ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
+ ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
+ _mm_storeu_pd(&f0[u].v,
+ _mm256_extractf128_pd(ff0, 0));
+ _mm_storeu_pd(&f0[u + qn].v,
+ _mm256_extractf128_pd(ff0, 1));
+
+ ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
+ gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
+ ff3 = _mm256_hadd_pd(
+ _mm256_mul_pd(ff1, gmt),
+ _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
+ ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
+ _mm_storeu_pd(&f1[u].v,
+ _mm256_extractf128_pd(ff3, 0));
+ _mm_storeu_pd(&f1[u + qn].v,
+ _mm256_extractf128_pd(ff3, 1));
+ }
+ } else {
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 16) {
+ for (u = 0; u < qn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+ __m256d gm1, gm2, g_re, g_im;
+ __m256d t_re, t_im, u_re, u_im;
+ __m256d tu1_re, tu2_re, tu1_im, tu2_im;
+
+ a_re = _mm256_loadu_pd(&f0[u].v);
+ a_im = _mm256_loadu_pd(&f0[u + qn].v);
+ c_re = _mm256_loadu_pd(&f1[u].v);
+ c_im = _mm256_loadu_pd(&f1[u + qn].v);
+
+ gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
+ g_re = _mm256_unpacklo_pd(gm1, gm2);
+ g_im = _mm256_unpackhi_pd(gm1, gm2);
+ g_re = _mm256_permute4x64_pd(g_re, 0xD8);
+ g_im = _mm256_permute4x64_pd(g_im, 0xD8);
+
+ b_re = FMSUB(
+ c_re, g_re, _mm256_mul_pd(c_im, g_im));
+ b_im = FMADD(
+ c_re, g_im, _mm256_mul_pd(c_im, g_re));
+
+ t_re = _mm256_add_pd(a_re, b_re);
+ t_im = _mm256_add_pd(a_im, b_im);
+ u_re = _mm256_sub_pd(a_re, b_re);
+ u_im = _mm256_sub_pd(a_im, b_im);
+
+ tu1_re = _mm256_unpacklo_pd(t_re, u_re);
+ tu2_re = _mm256_unpackhi_pd(t_re, u_re);
+ tu1_im = _mm256_unpacklo_pd(t_im, u_im);
+ tu2_im = _mm256_unpackhi_pd(t_im, u_im);
+ _mm256_storeu_pd(&f[(u << 1)].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
+ _mm256_storeu_pd(&f[(u << 1) + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
+ }
+ } else {
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+ }
+}
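+
+/*
+ * Illustrative note (not part of the upstream sources):
+ * poly_split_fft and poly_merge_fft are inverses of each other: given
+ * f in FFT representation, splitting it into the half-size f0 and f1
+ * and merging them back reproduces f (up to floating-point rounding).
+ * The split form is what the recursive Fourier-domain routines of the
+ * signer operate on.
+ */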
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.c
new file mode 100644
index 000000000..8940f3400
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.c
@@ -0,0 +1,1076 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+const fpr fpr_gm_tab[] = {
+ {0}, {0}, /* unused */
+ {-0.000000000000000000000000000}, { 1.000000000000000000000000000},
+ { 0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ {-0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ { 0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ {-0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ { 0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ {-0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ { 0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ {-0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ { 0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ {-0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ { 0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ {-0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ { 0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ {-0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ { 0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ {-0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ { 0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ {-0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ { 0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ {-0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ { 0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ {-0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ { 0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ {-0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ { 0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ {-0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ { 0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ {-0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ { 0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ {-0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ { 0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ {-0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ { 0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ {-0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ { 0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ {-0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ { 0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ {-0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ { 0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ {-0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ { 0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ {-0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ { 0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ {-0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ { 0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ {-0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ { 0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ {-0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ { 0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ {-0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ { 0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ {-0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ { 0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ {-0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ { 0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ {-0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ { 0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ {-0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ { 0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ {-0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ { 0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ {-0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ { 0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ {-0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ { 0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ {-0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ { 0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ {-0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ { 0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ {-0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ { 0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ {-0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ { 0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ {-0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ { 0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ {-0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ { 0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ {-0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ { 0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ {-0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ { 0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ {-0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ { 0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ {-0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ { 0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ {-0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ { 0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ {-0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ { 0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ {-0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ { 0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ {-0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ { 0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ {-0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ { 0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ {-0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ { 0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ {-0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ { 0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ {-0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ { 0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ {-0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ { 0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ {-0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ { 0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ {-0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ { 0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ {-0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ { 0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ {-0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ { 0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ {-0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ { 0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ {-0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ { 0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ {-0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ { 0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ {-0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ { 0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ {-0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ { 0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ {-0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ { 0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ {-0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ { 0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ {-0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ { 0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ {-0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ { 0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ {-0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ { 0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ {-0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ { 0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ {-0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ { 0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ {-0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ { 0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ {-0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ { 0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ {-0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ { 0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ {-0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ { 0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ {-0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ { 0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ {-0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ { 0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ {-0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ { 0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ {-0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ { 0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ {-0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ { 0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ {-0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ { 0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ {-0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ { 0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ {-0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ { 0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ {-0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ { 0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ {-0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ { 0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ {-0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ { 0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ {-0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ { 0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ {-0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ { 0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ {-0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ { 0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ {-0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ { 0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ {-0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ { 0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ {-0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ { 0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ {-0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ { 0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ {-0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ { 0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ {-0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ { 0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ {-0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ { 0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ {-0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ { 0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ {-0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ { 0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ {-0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ { 0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ {-0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ { 0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ {-0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ { 0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ {-0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ { 0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ {-0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ { 0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ {-0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ { 0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ {-0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ { 0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ {-0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ { 0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ {-0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ { 0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ {-0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ { 0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ {-0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ { 0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ {-0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ { 0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ {-0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ { 0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ {-0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ { 0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ {-0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ { 0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ {-0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ { 0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ {-0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ { 0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ {-0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ { 0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ {-0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ { 0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ {-0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ { 0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ {-0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ { 0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ {-0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ { 0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ {-0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ { 0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ {-0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ { 0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ {-0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ { 0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ {-0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ { 0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ {-0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ { 0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ {-0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ { 0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ {-0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ { 0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ {-0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ { 0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ {-0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ { 0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ {-0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ { 0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ {-0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ { 0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ {-0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ { 0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ {-0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ { 0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ {-0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ { 0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ {-0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ { 0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ {-0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ { 0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ {-0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ { 0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ {-0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ { 0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ {-0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ { 0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ {-0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ { 0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ {-0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ { 0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ {-0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ { 0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ {-0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ { 0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ {-0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ { 0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ {-0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ { 0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ {-0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ { 0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ {-0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ { 0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ {-0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ { 0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ {-0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ { 0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ {-0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ { 0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ {-0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ { 0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ {-0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ { 0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ {-0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ { 0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ {-0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ { 0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ {-0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ { 0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ {-0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ { 0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ {-0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ { 0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ {-0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ { 0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ {-0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ { 0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ {-0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ { 0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ {-0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ { 0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ {-0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ { 0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ {-0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ { 0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ {-0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ { 0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ {-0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ { 0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ {-0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ { 0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ {-0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ { 0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ {-0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ { 0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ {-0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ { 0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ {-0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ { 0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ {-0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ { 0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ {-0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ { 0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ {-0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ { 0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ {-0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ { 0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ {-0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ { 0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ {-0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ { 0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ {-0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ { 0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ {-0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ { 0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ {-0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ { 0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ {-0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ { 0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ {-0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ { 0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ {-0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ { 0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ {-0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ { 0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ {-0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ { 0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ {-0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ { 0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ {-0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ { 0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ {-0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ { 0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ {-0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ { 0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ {-0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ { 0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ {-0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ { 0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ {-0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ { 0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ {-0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ { 0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ {-0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ { 0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ {-0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ { 0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ {-0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ { 0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ {-0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ { 0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ {-0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ { 0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ {-0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ { 0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ {-0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ { 0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ {-0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ { 0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ {-0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ { 0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ {-0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ { 0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ {-0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ { 0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ {-0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ { 0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ {-0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ { 0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ {-0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ { 0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ {-0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ { 0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ {-0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ { 0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ {-0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ { 0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ {-0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ { 0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ {-0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ { 0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ {-0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ { 0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ {-0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ { 0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ {-0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ { 0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ {-0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ { 0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ {-0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ { 0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ {-0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ { 0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ {-0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ { 0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ {-0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ { 0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ {-0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ { 0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ {-0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ { 0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ {-0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ { 0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ {-0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ { 0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ {-0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ { 0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ {-0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ { 0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ {-0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ { 0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ {-0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ { 0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ {-0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ { 0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ {-0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ { 0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ {-0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ { 0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ {-0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ { 0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ {-0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ { 0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ {-0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ { 0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ {-0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ { 0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ {-0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ { 0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ {-0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ { 0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ {-0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ { 0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ {-0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ { 0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ {-0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ { 0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ {-0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ { 0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ {-0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ { 0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ {-0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ { 0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ {-0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ { 0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ {-0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ { 0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ {-0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ { 0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ {-0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ { 0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ {-0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ { 0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ {-0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ { 0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ {-0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ { 0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ {-0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ { 0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ {-0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ { 0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ {-0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ { 0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ {-0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ { 0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ {-0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ { 0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ {-0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ { 0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ {-0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ { 0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ {-0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ { 0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ {-0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ { 0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ {-0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ { 0.999995293809576171511580126}, { 0.003067956762965976270145365},
+ {-0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ { 0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ {-0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ { 0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ {-0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ { 0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ {-0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ { 0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ {-0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ { 0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ {-0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ { 0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ {-0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ { 0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ {-0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ { 0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ {-0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ { 0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ {-0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ { 0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ {-0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ { 0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ {-0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ { 0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ {-0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ { 0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ {-0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ { 0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ {-0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ { 0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ {-0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ { 0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ {-0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ { 0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ {-0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ { 0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ {-0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ { 0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ {-0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ { 0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ {-0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ { 0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ {-0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ { 0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ {-0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ { 0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ {-0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ { 0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ {-0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ { 0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ {-0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ { 0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ {-0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ { 0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ {-0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ { 0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ {-0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ { 0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ {-0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ { 0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ {-0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ { 0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ {-0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ { 0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ {-0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ { 0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ {-0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ { 0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ {-0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ { 0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ {-0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ { 0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ {-0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ { 0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ {-0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ { 0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ {-0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ { 0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ {-0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ { 0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ {-0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ { 0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ {-0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ { 0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ {-0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ { 0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ {-0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ { 0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ {-0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ { 0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ {-0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ { 0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ {-0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ { 0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ {-0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ { 0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ {-0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ { 0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ {-0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ { 0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ {-0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ { 0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ {-0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ { 0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ {-0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ { 0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ {-0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ { 0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ {-0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ { 0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ {-0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ { 0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ {-0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ { 0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ {-0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ { 0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ {-0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ { 0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ {-0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ { 0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ {-0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ { 0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ {-0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ { 0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ {-0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ { 0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ {-0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ { 0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ {-0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ { 0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ {-0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ { 0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ {-0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ { 0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ {-0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ { 0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ {-0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ { 0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ {-0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ { 0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ {-0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ { 0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ {-0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ { 0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ {-0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ { 0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ {-0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ { 0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ {-0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ { 0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ {-0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ { 0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ {-0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ { 0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ {-0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ { 0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ {-0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ { 0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ {-0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ { 0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ {-0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ { 0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ {-0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ { 0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ {-0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ { 0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ {-0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ { 0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ {-0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ { 0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ {-0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ { 0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ {-0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ { 0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ {-0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ { 0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ {-0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ { 0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ {-0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ { 0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ {-0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ { 0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ {-0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ { 0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ {-0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ { 0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ {-0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ { 0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ {-0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ { 0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ {-0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ { 0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ {-0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ { 0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ {-0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ { 0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ {-0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ { 0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ {-0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ { 0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ {-0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ { 0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ {-0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ { 0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ {-0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ { 0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ {-0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ { 0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ {-0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ { 0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ {-0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ { 0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ {-0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ { 0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ {-0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ { 0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ {-0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ { 0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ {-0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ { 0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ {-0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ { 0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ {-0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ { 0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ {-0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ { 0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ {-0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ { 0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ {-0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ { 0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ {-0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ { 0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ {-0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ { 0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ {-0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ { 0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ {-0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ { 0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ {-0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ { 0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ {-0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ { 0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ {-0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ { 0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ {-0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ { 0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ {-0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ { 0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ {-0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ { 0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ {-0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ { 0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ {-0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ { 0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ {-0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ { 0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ {-0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ { 0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ {-0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ { 0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ {-0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ { 0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ {-0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ { 0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ {-0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ { 0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ {-0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ { 0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ {-0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ { 0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ {-0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ { 0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ {-0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ { 0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ {-0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ { 0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ {-0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ { 0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ {-0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ { 0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ {-0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ { 0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ {-0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ { 0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ {-0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ { 0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ {-0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ { 0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ {-0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ { 0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ {-0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ { 0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ {-0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ { 0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ {-0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ { 0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ {-0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ { 0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ {-0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ { 0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ {-0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ { 0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ {-0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ { 0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ {-0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ { 0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ {-0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ { 0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ {-0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ { 0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ {-0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ { 0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ {-0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ { 0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ {-0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ { 0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ {-0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ { 0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ {-0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ { 0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ {-0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ { 0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ {-0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ { 0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ {-0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ { 0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ {-0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ { 0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ {-0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ { 0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ {-0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ { 0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ {-0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ { 0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ {-0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ { 0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ {-0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ { 0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ {-0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ { 0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ {-0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ { 0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ {-0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ { 0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ {-0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ { 0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ {-0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ { 0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ {-0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ { 0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ {-0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ { 0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ {-0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ { 0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ {-0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ { 0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ {-0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ { 0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ {-0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ { 0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ {-0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ { 0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ {-0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ { 0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ {-0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ { 0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ {-0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ { 0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ {-0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ { 0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ {-0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ { 0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ {-0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ { 0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ {-0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ { 0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ {-0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ { 0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ {-0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ { 0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ {-0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ { 0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ {-0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ { 0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ {-0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ { 0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ {-0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ { 0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ {-0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ { 0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ {-0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ { 0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ {-0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ { 0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ {-0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ { 0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ {-0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ { 0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ {-0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ { 0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ {-0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ { 0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ {-0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ { 0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ {-0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ { 0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ {-0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ { 0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ {-0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ { 0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ {-0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ { 0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ {-0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ { 0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ {-0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ { 0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ {-0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ { 0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ {-0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ { 0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ {-0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ { 0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ {-0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ { 0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ {-0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ { 0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ {-0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ { 0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ {-0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ { 0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ {-0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ { 0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ {-0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ { 0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ {-0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ { 0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ {-0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ { 0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ {-0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ { 0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ {-0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ { 0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ {-0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ { 0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ {-0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ { 0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ {-0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ { 0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ {-0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ { 0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ {-0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ { 0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ {-0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ { 0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ {-0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ { 0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ {-0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ { 0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ {-0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ { 0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ {-0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ { 0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ {-0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ { 0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ {-0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ { 0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ {-0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ { 0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ {-0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ { 0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ {-0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ { 0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ {-0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ { 0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ {-0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ { 0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ {-0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ { 0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ {-0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ { 0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ {-0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ { 0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ {-0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ { 0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ {-0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ { 0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ {-0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ { 0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ {-0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ { 0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ {-0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ { 0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ {-0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ { 0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ {-0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ { 0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ {-0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ { 0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ {-0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ { 0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ {-0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ { 0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ {-0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ { 0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ {-0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ { 0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ {-0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ { 0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ {-0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ { 0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ {-0.999995293809576171511580126}, { 0.003067956762965976270145365}
+};
+
+const fpr fpr_p2_tab[] = {
+ { 2.00000000000 },
+ { 1.00000000000 },
+ { 0.50000000000 },
+ { 0.25000000000 },
+ { 0.12500000000 },
+ { 0.06250000000 },
+ { 0.03125000000 },
+ { 0.01562500000 },
+ { 0.00781250000 },
+ { 0.00390625000 },
+ { 0.00195312500 }
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.h b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.h
new file mode 100644
index 000000000..6073efff3
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/fpr.h
@@ -0,0 +1,362 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <immintrin.h>
+
+/*
+ * We wrap the native 'double' type into a structure so that the C compiler
+ * complains if we inadvertently use raw arithmetic operators on the 'fpr'
+ * type instead of using the inline functions below. This should have no
+ * extra runtime cost, since all the functions below are 'inline'.
+ */
+typedef struct {
+ double v;
+} fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x.v = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return FPR((double)i);
+}
+
+static const fpr fpr_q = { 12289.0 };
+static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
+static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 };
+static const fpr fpr_inv_sigma[] = {
+ { 0.0 }, /* unused */
+ { 0.0069054793295940891952143765991630516 },
+ { 0.0068102267767177975961393730687908629 },
+ { 0.0067188101910722710707826117910434131 },
+ { 0.0065883354370073665545865037227681924 },
+ { 0.0064651781207602900738053897763485516 },
+ { 0.0063486788828078995327741182928037856 },
+ { 0.0062382586529084374473367528433697537 },
+ { 0.0061334065020930261548984001431770281 },
+ { 0.0060336696681577241031668062510953022 },
+ { 0.0059386453095331159950250124336477482 }
+};
+static const fpr fpr_sigma_min[] = {
+ { 0.0 }, /* unused */
+ { 1.1165085072329102588881898380334015 },
+ { 1.1321247692325272405718031785357108 },
+ { 1.1475285353733668684571123112513188 },
+ { 1.1702540788534828939713084716509250 },
+ { 1.1925466358390344011122170489094133 },
+ { 1.2144300507766139921088487776957699 },
+ { 1.2359260567719808790104525941706723 },
+ { 1.2570545284063214162779743112075080 },
+ { 1.2778336969128335860256340575729042 },
+ { 1.2982803343442918539708792538826807 }
+};
+static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
+static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
+static const fpr fpr_bnorm_max = { 16822.4121 };
+static const fpr fpr_zero = { 0.0 };
+static const fpr fpr_one = { 1.0 };
+static const fpr fpr_two = { 2.0 };
+static const fpr fpr_onehalf = { 0.5 };
+static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
+static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
+static const fpr fpr_ptwo31 = { 2147483648.0 };
+static const fpr fpr_ptwo31m1 = { 2147483647.0 };
+static const fpr fpr_mtwo31m1 = { -2147483647.0 };
+static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
+static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
+static const fpr fpr_ptwo63 = { 9223372036854775808.0 };
+
+static inline int64_t
+fpr_rint(fpr x) {
+ /*
+ * We do not want to use llrint() since it might be not
+ * constant-time.
+ *
+ * Suppose that x >= 0. If x >= 2^52, then it is already an
+ * integer. Otherwise, if x < 2^52, then computing x+2^52 will
+ * yield a value that will be rounded to the nearest integer
+ * with exactly the right rules (round-to-nearest-even).
+ *
+ * In order to have constant-time processing, we must do the
+ * computation for both x >= 0 and x < 0 cases, and use a
+ * cast to an integer to access the sign and select the proper
+ * value. Such casts also allow us to find out if |x| < 2^52.
+ */
+ int64_t sx, tx, rp, rn, m;
+ uint32_t ub;
+
+ sx = (int64_t)(x.v - 1.0);
+ tx = (int64_t)x.v;
+ rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
+ rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
+
+ /*
+ * If tx >= 2^52 or tx < -2^52, then result is tx.
+ * Otherwise, if sx >= 0, then result is rp.
+ * Otherwise, result is rn. We use the fact that when x is
+ * close to 0 (|x| <= 0.25) then both rp and rn are correct;
+ * and if x is not close to 0, then trunc(x-1.0) yields the
+ * appropriate sign.
+ */
+
+ /*
+ * Clamp rp to zero if tx < 0.
+ * Clamp rn to zero if tx >= 0.
+ */
+ m = sx >> 63;
+ rn &= m;
+ rp &= ~m;
+
+ /*
+ * Get the 12 upper bits of tx; if they are not all zeros or
+ * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
+ * rp and rn to zero. Otherwise, we clamp tx to zero.
+ */
+ ub = (uint32_t)((uint64_t)tx >> 52);
+ m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
+ rp &= m;
+ rn &= m;
+ tx &= ~m;
+
+ /*
+ * Only one of tx, rn or rp (at most) can be non-zero at this
+ * point.
+ */
+ return tx | rn | rp;
+}
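
The 2^52 trick used in fpr_rint() above is easier to see in isolation. The following stand-alone snippet (illustration only, not part of the patch) shows how adding and then subtracting 2^52 rounds a small non-negative double to the nearest integer, ties to even, without calling llrint():

    #include <stdio.h>

    int main(void) {
        /* 4503599627370496.0 == 2^52: adding it pushes the fractional bits
         * out of the 52-bit mantissa, so the addition itself performs
         * round-to-nearest-even as a side effect. */
        double in[] = { 0.4, 0.5, 1.5, 2.49999, 3.75 };
        for (int i = 0; i < 5; i++) {
            long long r = (long long)(in[i] + 4503599627370496.0)
                          - 4503599627370496LL;
            printf("%.5f -> %lld\n", in[i], r);  /* prints 0, 0, 2, 2, 4 */
        }
        return 0;
    }
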
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x.v;
+ return r - (x.v < (double)r);
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x.v;
+}
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return FPR(x.v + y.v);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return FPR(x.v - y.v);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return FPR(-x.v);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return FPR(x.v * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return FPR(x.v + x.v);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return FPR(x.v * y.v);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return FPR(x.v * x.v);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return FPR(1.0 / x.v);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return FPR(x.v / y.v);
+}
+
+static inline void
+fpr_sqrt_avx2(double *t) {
+ __m128d x;
+
+ x = _mm_load1_pd(t);
+ x = _mm_sqrt_pd(x);
+ _mm_storel_pd(t, x);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ /*
+ * We prefer not to have a dependency on libm when it can be
+ * avoided. On x86, calling the sqrt() libm function inlines
+ * the relevant opcode (fsqrt or sqrtsd, depending on whether
+ * the 387 FPU or SSE2 is used for floating-point operations)
+ * but then makes an optional call to the library function
+ * for proper error handling, in case the operand is negative.
+ *
+ * To avoid this dependency, we use intrinsics or inline assembly
+ * on recognized platforms:
+ *
+ * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
+ * to call the 387 FPU fsqrt opcode.
+ *
+ * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
+ * the fsqrt opcode (Clang needs a special hack).
+ *
+ * - On GCC/Clang on ARM with hardware floating-point, we use
+ * inline assembly to call the vsqrt.f64 opcode. Due to a
+ * complex ecosystem of compilers and assembly syntaxes, we
+ * have to call it "fsqrt" or "fsqrtd", depending on case.
+ *
+ * If the platform is not recognized, a call to the system
+ * library function sqrt() is performed. On some compilers, this
+ * may actually inline the relevant opcode, and call the library
+ * function only when the input is invalid (e.g. negative);
+ * Falcon never actually calls sqrt() on a negative value, but
+ * the dependency to libm will still be there.
+ */
+
+ fpr_sqrt_avx2(&x.v);
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x.v < y.v;
+}
+
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+
+ /*
+ * AVX2 implementation uses more operations than Horner's method,
+ * but with a lower expression tree depth. This helps because
+ * additions and multiplications have a latency of 4 cycles on
+ * a Skylake, but the CPU can issue two of them per cycle.
+ */
+
+ static const union {
+ double d[12];
+ __m256d v[3];
+ } c = {
+ {
+ 0.999999999999994892974086724280,
+ 0.500000000000019206858326015208,
+ 0.166666666666984014666397229121,
+ 0.041666666666110491190622155955,
+ 0.008333333327800835146903501993,
+ 0.001388888894063186997887560103,
+ 0.000198412739277311890541063977,
+ 0.000024801566833585381209939524,
+ 0.000002755586350219122514855659,
+ 0.000000275607356160477811864927,
+ 0.000000025299506379442070029551,
+ 0.000000002073772366009083061987
+ }
+ };
+
+ double d1, d2, d4, d8, y;
+ __m256d d14, d58, d9c;
+
+ d1 = -x.v;
+ d2 = d1 * d1;
+ d4 = d2 * d2;
+ d8 = d4 * d4;
+ d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
+ d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
+ d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
+ d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
+ d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
+ d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
+ d9c = _mm256_hadd_pd(d9c, d9c);
+ y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
+ + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
+ y *= ccs.v;
+
+ /*
+ * Final conversion goes through int64_t first, because that's what
+ * the underlying opcode (vcvttsd2si) will do, and we know that the
+ * result will fit, since x >= 0 and ccs < 1. If we did the
+ * conversion directly to uint64_t, then the compiler would add some
+ * extra code to cover the case of a source value of 2^63 or more,
+ * and though the alternate path would never be exercised, the
+ * extra comparison would cost us some cycles.
+ */
+ return (uint64_t)(int64_t)(y * fpr_ptwo63.v);
+
+}
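
For contrast with the vectorised evaluation above, the same 12-term polynomial written as a plain Horner scheme looks as follows (a sketch for illustration only, not code from the patch; the multiplication by ccs and the 2^63 scaling are omitted):

    /* c[] are the 12 coefficients from the table above; d = -x. */
    static double expm_poly_scalar(double x, const double c[12]) {
        double d = -x;
        double y = c[11];
        for (int i = 10; i >= 0; i--) {
            y = y * d + c[i];    /* long dependency chain, fewer operations */
        }
        return 1.0 + y * d;      /* 1 + c[0]*d + c[1]*d^2 + ... + c[11]*d^12 */
    }
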
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED1024_AVX2_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED1024_AVX2_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/inner.h b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/inner.h
new file mode 100644
index 000000000..5c0d57b22
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/inner.h
@@ -0,0 +1,827 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED1024_AVX2_ macro (e.g. PQCLEAN_FALCONPADDED1024_AVX2_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * This implementation uses AVX2 and optionally FMA intrinsics.
+ */
+#include <immintrin.h>
+#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)
+#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
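
As a usage note (not part of the patch), the macros above map one-to-one onto the incremental SHAKE256 API from fips202.h; a typical absorb/squeeze sequence therefore looks like this sketch:

    #include <stddef.h>
    #include <stdint.h>
    #include "inner.h"   /* brings in the inner_shake256_* macros */

    static void shake_usage_sketch(const uint8_t *msg, size_t msg_len,
                                   uint8_t *out, size_t out_len) {
        inner_shake256_context sc;

        inner_shake256_init(&sc);
        inner_shake256_inject(&sc, msg, msg_len);   /* absorb input */
        inner_shake256_flip(&sc);                   /* finalize absorb phase */
        inner_shake256_extract(&sc, out, out_len);  /* squeeze output bytes */
        inner_shake256_ctx_release(&sc);            /* release incremental state */
    }
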
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED1024_AVX2_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED1024_AVX2_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_AVX2_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_AVX2_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ * return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
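
The wrappers listed above compose in the obvious way; for instance (a sketch, not code from the patch), a squared Euclidean norm of two coordinates would be written as:

    /* Hypothetical helper, shown only to illustrate the fpr interface. */
    static fpr norm2_sketch(fpr x, fpr y) {
        return fpr_add(fpr_sqr(x), fpr_sqr(y));
    }
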
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_AVX2_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+ }
+ return v;
+}
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
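
A quick check of the macro values above against the "28*2^logn bytes" rule (illustration only, not part of the patch):

    /* Holds for logn >= 3; logn = 1 and 2 are the documented exceptions. */
    _Static_assert(FALCON_KEYGEN_TEMP_3  == (28u << 3),  "28 * 2^3  = 224");
    _Static_assert(FALCON_KEYGEN_TEMP_10 == (28u << 10), "28 * 2^10 = 28672");
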
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED1024_AVX2_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED1024_AVX2_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED1024_AVX2_gaussian0_sampler(prng *p);
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/keygen.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/keygen.c
new file mode 100644
index 000000000..d3197b8c7
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/keygen.c
@@ -0,0 +1,4233 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
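+ *
+ * For a concrete illustration, take the first prime of the PRIMES[]
+ * table below, p = 2147473409, so that R = 2^31 - p = 10239. The
+ * Montgomery representation of 2 is 2*R = 20478, and Montgomery-
+ * multiplying 20478 by itself yields (2*R * 2*R)/R = 4*R = 40956,
+ * which is the Montgomery representation of 4, as expected.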
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
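+ *
+ * For instance, in the first entry below, the "product of all previous
+ * primes" is the empty product 1, so s is simply the Montgomery
+ * representation of 1, i.e. R = 2^31 - 2147473409 = 10239.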
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
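+ * The result is x itself if x <= (p-1)/2, and x - p otherwise.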
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
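+ *
+ * The initial y = 2 - p is an inverse of p modulo 2^2; each
+ * y *= 2 - p*y step is a Newton/Hensel lift that doubles the number
+ * of exact low-order bits (2 -> 4 -> 8 -> 16 -> 32), so the four
+ * steps below are enough for 31 bits. The final negation and mask
+ * turn 1/p into -1/p mod 2^31.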
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
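+ * The result is a*b/2^31 mod p, in the 0..p-1 range.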
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also remove one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
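+ *
+ * For example, over 10 bits, rev(1) = 0b1000000000 = 512 and
+ * rev(3) = 0b1100000000 = 768, matching REV10[1] and REV10[3] above.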
+ */
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
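+ *
+ * This is the usual iterative radix-2 NTT: each of the logn passes
+ * applies butterflies (x, y) -> (x + s*y, x - s*y), with the twiddle
+ * factor s read from gm[] (which modp_mkgm2() filled in bit-reversed
+ * order).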
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
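+ *
+ * In NTT representation this is just a point-wise product of each
+ * consecutive pair of values; the extra Montgomery multiplication by
+ * R2 cancels the 1/R factor introduced by the first one, so the
+ * result is the plain product modulo p.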
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
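+ *
+ * For example, the value 2^31 is encoded as the two words { 0, 1 }
+ * (words are little-endian, 31 value bits per word).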
+ */
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
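+ *
+ * When d is negative, its two's complement encoding (bit 30 of the
+ * top word set) represents d + 2^(31*dlen); the conditional
+ * subtraction of Rx below compensates for that offset modulo p.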
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
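+ *
+ * This is essentially Garner's incremental CRT: at each step, if q is
+ * the product of the primes already processed, the partially rebuilt
+ * value (known modulo q) is adjusted by a multiple of q so that it
+ * also matches the residue modulo the new prime p. The precomputed
+ * 's' field of PRIMES[] supplies the required 1/q mod p constant, in
+ * Montgomery representation so that a single Montgomery
+ * multiplication yields a plain value.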
+ */
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have a 32-th bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remaining of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step will reduce the total length
+ * (sum of lengths of a and b) by at least 30 bits at each
+ * iteration.
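+ *
+ * As a small sanity check of the output convention: with x = 3 and
+ * y = 5 (both odd), the GCD is 1 and the bounds above force u = 2
+ * and v = 1, since 3*2 - 5*1 = 1.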
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * Instead of shifting values right, we double
+ * a_lo or b_lo at each iteration, so a division
+ * by 2 is implemented by simply not doubling
+ * the value.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
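+ /*
+ * Limbs hold 31-bit values, so bit 30 of the top limb is the sign
+ * bit. -(y[ylen - 1] >> 30) is all-ones when that bit is set and 0
+ * otherwise; the unsigned shift by 1 then yields the padding word
+ * 0x7FFFFFFF (y negative) or 0 (y nonnegative), used when reading
+ * y beyond its ylen words.
+ */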
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
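+ /*
+ * Sign-extend the 31-bit value to 32 bits by copying bit 30 into
+ * bit 31; e.g. 0x7FFFFFFF (minus one over 31 bits) becomes
+ * 0xFFFFFFFF, i.e. -1 as an int32_t, while 0x3FFFFFFF is unchanged.
+ */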
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to a higher word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
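+ /*
+ * Multiplication is modulo X^N+1, so X^N = -1: once the target
+ * index wraps past N-1, contributions land back at the start of
+ * F with a flipped sign (e.g. for N = 4, X^3 * X^2 = X^5 = -X),
+ * hence the negation of kf below.
+ */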
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ /*
+ * On little-endian systems we just interpret the bytes "as is"
+ * (this is correct because the exact-width types such as
+ * 'uint64_t' are guaranteed to have no padding and no trap
+ * representation).
+ */
+ uint64_t r;
+
+ inner_shake256_extract(rng, (uint8_t *)&r, sizeof r);
+ return r;
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
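+
+/*
+ * As a rough numerical check: sigma = 1.17*sqrt(12289/2048) is about
+ * 2.87, hence P(x = 0) is about 0.139, and indeed
+ * 1283868770400643928 / 2^63 is approximately 0.139. The last table
+ * entry is 0, so sampled magnitudes are bounded by the number of
+ * table entries.
+ */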
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
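+
+/*
+ * As a rough consistency check against the measured averages above:
+ * at depth 10, fg is about 6308 bits on average, i.e. roughly 204
+ * 31-bit words, against MAX_BL_SMALL[10] = 209; similarly, FG at
+ * depth 9 is about 9403 bits (roughly 304 words) against
+ * MAX_BL_LARGE[9] = 308.
+ */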
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
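+ /*
+ * ng is the OR of all partial sums, so its top bit is set if any
+ * partial sum reached 2^31; -(ng >> 31) is then all-ones and the
+ * OR below saturates the result to 2^32-1.
+ */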
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure to solve the NTRU equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, reduction
+ * coefficient will remain low. If it goes out-of-range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -(2^31-1)..+(2^31-1).
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the n first elements
+ * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in fk->tmp, one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
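+ /*
+ * modp_montymul(a, b) returns a*b/R mod p, so each product below
+ * carries a factor 1/R; r = 12289/R mod p carries the same factor,
+ * which keeps the comparison consistent. The loop thus checks that
+ * f*G - g*F = q = 12289 at every NTT point.
+ */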
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
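+ /*
+ * If either squared norm has its top bit set (overflow
+ * indicator), the expression below forces the combined value
+ * to 2^32-1, so the bound check rejects this (f,g) pair.
+ */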
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_AVX2_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/pqclean.c
new file mode 100644
index 000000000..06560ed5c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 1280 bytes
+ *
+ * message + signature:
+ * signature 1280 bytes
+ * message
+ */
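+
+/*
+ * For Falcon-padded-1024 (nnnn = 10) the signature length is thus fixed
+ * at 1280 bytes: 1 header byte, 40 nonce bytes, and 1239 bytes holding
+ * the compressed signature value followed by zero padding.
+ */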
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_10];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024];
+ uint16_t h[1024];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED1024_AVX2_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 10;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ f, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ g, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ F, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 10;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 10);
+ if (v != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024], G[1024];
+ struct {
+ int16_t sig[1024];
+ uint16_t hm[1024];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 10) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ f, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ g, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_AVX2_trim_i8_decode(
+ F, 10, PQCLEAN_FALCONPADDED1024_AVX2_max_FG_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_AVX2_complete_private(G, f, g, F, 10, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash message nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(&sc, r.hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCONPADDED1024_AVX2_comp_encode(sigbuf, sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
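+ /*
+ * Zero-pad the rest of the fixed-size signature field,
+ * as required by the padded format.
+ */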
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[1024], hm[1024];
+ int16_t sig[1024];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 10) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED1024_AVX2_modq_decode(h, 10,
+ pk + 1, PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED1024_AVX2_to_ntt_monty(h, 10);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED1024_AVX2_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
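+
+ /*
+ * In the padded format the signature field has a fixed length;
+ * any bytes after the encoded value must be zero, otherwise the
+ * signature is rejected.
+ */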
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_AVX2_hash_to_point_ct(&sc, hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED1024_AVX2_verify_raw(hm, sig, h, 10, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 10;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 10) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 10;
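+ /*
+ * The extra increment accounts for the header byte, so that
+ * *smlen = mlen + 1 + NONCELEN + (CRYPTO_BYTES - NONCELEN - 1)
+ * = mlen + CRYPTO_BYTES.
+ */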
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 10) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED1024_AVX2_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/rng.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/rng.c
new file mode 100644
index 000000000..001aecb4e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/rng.c
@@ -0,0 +1,179 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <string.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_prng_init(prng *p, inner_shake256_context *src) {
+ inner_shake256_extract(src, p->state.d, 56);
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+ int i;
+ uint32_t *sw;
+ union {
+ uint32_t w[16];
+ __m256i y[2]; /* for alignment */
+ } t;
+ __m256i state[16], init[16];
+
+ sw = (uint32_t *)p->state.d;
+
+ /*
+ * XOR next counter values into state.
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ t.w[u] = (uint32_t)(cc + u);
+ t.w[u + 8] = (uint32_t)((cc + u) >> 32);
+ }
+ *(uint64_t *)(p->state.d + 48) = cc + 8;
+
+ /*
+ * Load state.
+ */
+ for (u = 0; u < 4; u ++) {
+ state[u] = init[u] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)CW[u]));
+ }
+ for (u = 0; u < 10; u ++) {
+ state[u + 4] = init[u + 4] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[u]));
+ }
+ state[14] = init[14] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[10])),
+ _mm256_loadu_si256((__m256i *)&t.w[0]));
+ state[15] = init[15] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[11])),
+ _mm256_loadu_si256((__m256i *)&t.w[8]));
+
+ /*
+ * Do all rounds.
+ */
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 16), \
+ _mm256_srli_epi32(state[d], 16)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 12), \
+ _mm256_srli_epi32(state[b], 20)); \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 8), \
+ _mm256_srli_epi32(state[d], 24)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 7), \
+ _mm256_srli_epi32(state[b], 25)); \
+ } while (0)
+
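+ /*
+ * Four column rounds followed by four diagonal rounds; ten such
+ * double rounds give the 20 rounds of ChaCha20.
+ */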
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ /*
+ * Add initial state back and encode the result in the destination
+ * buffer. We can dump the AVX2 values "as is" because the non-AVX2
+ * code uses a compatible order of values.
+ */
+ for (u = 0; u < 16; u ++) {
+ _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
+ _mm256_add_epi32(state[u], init[u]));
+ }
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/sign.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/sign.c
new file mode 100644
index 000000000..6761dbd60
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/sign.c
@@ -0,0 +1,1319 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
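+ /*
+ * The closed form s(logn) = (logn + 1) * 2^logn satisfies both
+ * relations, since (logn+1)*2^logn = 2^logn + 2*(logn * 2^(logn-1)).
+ */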
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr w0, w1, w2, w3, sigma;
+ __m128d ww0, ww1, wa, wb, wc, wd;
+ __m128d wy0, wy1, wz0, wz1;
+ __m128d half, invsqrt8, invsqrt2, neghi, neglo;
+ int si0, si1, si2, si3;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ half = _mm_set1_pd(0.5);
+ invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052);
+ invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105);
+ neghi = _mm_set_pd(-0.0, 0.0);
+ neglo = _mm_set_pd(0.0, -0.0);
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ ww0 = _mm_loadu_pd(&t1[0].v);
+ ww1 = _mm_loadu_pd(&t1[2].v);
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree1[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree1[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree1[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z1[0].v, ww0);
+ _mm_storeu_pd(&z1[2].v, ww1);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0);
+ wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1);
+ wz0 = _mm_loadu_pd(&tree[0].v);
+ wz1 = _mm_loadu_pd(&tree[2].v);
+ ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1));
+ ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0));
+ ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v));
+ ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v));
+
+ /*
+ * Second recursive invocation.
+ */
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree0[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree0[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree0[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z0[0].v, ww0);
+ _mm_storeu_pd(&z0[2].v, ww1);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
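+ /*
+ * If any partial sum overflowed (detected through the accumulated
+ * OR in ng), saturate sqn so that the shortness test below fails.
+ */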
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_AVX2_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_gaussian0_sampler(prng *p) {
+
+ /*
+ * High words.
+ */
+ static const union {
+ uint16_t u16[16];
+ __m256i ymm[1];
+ } rhi15 = {
+ {
+ 0x51FB, 0x2A69, 0x113E, 0x0568,
+ 0x014A, 0x003B, 0x0008, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000
+ }
+ };
+
+ static const union {
+ uint64_t u64[20];
+ __m256i ymm[5];
+ } rlo57 = {
+ {
+ 0x1F42ED3AC391802, 0x12B181F3F7DDB82,
+ 0x1CDD0934829C1FF, 0x1754377C7994AE4,
+ 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F,
+ 0x024DD542B776AE4, 0x1A1FFDC65AD63DA,
+ 0x01F80D88A7B6428, 0x001C3FDB2040C69,
+ 0x00012CF24D031FB, 0x00000949F8B091F,
+ 0x0000003665DA998, 0x00000000EBF6EBB,
+ 0x0000000002F5D7E, 0x000000000007098,
+ 0x0000000000000C6, 0x000000000000001,
+ 0x000000000000000, 0x000000000000000
+ }
+ };
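+
+ /*
+ * rhi15 and rlo57 hold, respectively, the high 15 bits and the low
+ * 57 bits of the 72-bit table values; the sampler output is the
+ * number of those values that exceed a fresh 72-bit random draw,
+ * which follows the target half-Gaussian distribution.
+ */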
+
+ uint64_t lo;
+ unsigned hi;
+ __m256i xhi, rhi, gthi, eqhi, eqm;
+ __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4;
+ __m128i t, zt;
+ int r;
+
+ /*
+ * Get a 72-bit random value and split it into a low part
+ * (57 bits) and a high part (15 bits)
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ hi = (hi << 7) | (unsigned)(lo >> 57);
+ lo &= 0x1FFFFFFFFFFFFFF;
+
+ /*
+ * Broadcast the high part and compare it with the relevant
+ * values. We need both a "greater than" and an "equal"
+ * comparisons.
+ */
+ xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128((int)hi));
+ rhi = _mm256_loadu_si256(&rhi15.ymm[0]);
+ gthi = _mm256_cmpgt_epi16(rhi, xhi);
+ eqhi = _mm256_cmpeq_epi16(rhi, xhi);
+
+ /*
+ * The result is the number of 72-bit values (among the list of 19)
+ * which are greater than the 72-bit random value. We first count
+ * all non-zero 16-bit elements in the first eight of gthi. Such
+ * elements have value -1 or 0, so we first negate them.
+ */
+ t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15);
+ zt = _mm_setzero_si128();
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ r = _mm_cvtsi128_si32(t);
+
+ /*
+ * We must look at the low bits for all values for which the
+ * high bits are an "equal" match; values 8-18 all have the
+ * same high bits (0).
+ * On 32-bit systems, 'lo' really is two registers, requiring
+ * some extra code.
+ */
+ #if defined(__x86_64__) || defined(_M_X64)
+ xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
+ #else
+ {
+ uint32_t e0, e1;
+ int32_t f0, f1;
+
+ e0 = (uint32_t)lo;
+ e1 = (uint32_t)(lo >> 32);
+ f0 = *(int32_t *)&e0;
+ f1 = *(int32_t *)&e1;
+ xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0);
+ }
+ #endif
+ gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
+ gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
+ gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
+ gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo);
+ gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo);
+
+ /*
+ * Keep only comparison results that correspond to the non-zero
+ * elements in eqhi.
+ */
+ gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(eqhi)));
+ gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8))));
+ eqm = _mm256_permute4x64_epi64(eqhi, 0xFF);
+ gtlo2 = _mm256_and_si256(gtlo2, eqm);
+ gtlo3 = _mm256_and_si256(gtlo3, eqm);
+ gtlo4 = _mm256_and_si256(gtlo4, eqm);
+
+ /*
+ * Add all values to count the total number of "-1" elements.
+ * Since the first eight "high" words are all different, only
+ * one element (at most) in gtlo0:gtlo1 can be non-zero; however,
+ * if the high word of the random value is zero, then many
+ * elements of gtlo2:gtlo3:gtlo4 can be non-zero.
+ */
+ gtlo0 = _mm256_or_si256(gtlo0, gtlo1);
+ gtlo0 = _mm256_add_epi64(
+ _mm256_add_epi64(gtlo0, gtlo2),
+ _mm256_add_epi64(gtlo3, gtlo4));
+ t = _mm_add_epi64(
+ _mm256_castsi256_si128(gtlo0),
+ _mm256_extracti128_si256(gtlo0, 1));
+ t = _mm_add_epi64(t, _mm_srli_si128(t, 8));
+ r -= _mm_cvtsi128_si32(t);
+
+ return r;
+
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
+     * PRNG output to limit its consumption; the sign of the difference
+ * yields the expected result.
+ */
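+    /*
+     * Worked example: the PRNG bytes are compared against the bytes of
+     * z from most significant to least significant. If, say, the top
+     * byte of z is 0xB5 and the first PRNG byte is 0x4C, then w is
+     * negative, the loop stops at once and the function returns 1; on
+     * average a little more than one PRNG byte is consumed per call.
+     */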
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
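+        /*
+         * For instance, z0 = 3 gives z = 1 + 3 = 4 when b = 1, and
+         * z = -3 when b = 0, so the two half-Gaussians together cover
+         * all integers exactly once.
+         */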
+ z0 = PQCLEAN_FALCONPADDED1024_AVX2_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+         * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_avx2/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/vrfy.c
new file mode 100644
index 000000000..534d5d8c0
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_avx2/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
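+/*
+ * For example, x = -5 becomes y = 0xFFFFFFFB; its top bit is 1, so q is
+ * added and the result is 12284, which is -5 mod q.
+ */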
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
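+/*
+ * For example, mq_rshift1(1) = (1 + q) / 2 = 6145, and indeed
+ * 2 * 6145 = 12290 = 1 mod q.
+ */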
+static inline uint32_t
+mq_rshift1(uint32_t x) {
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
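+/*
+ * For example, mq_montymul(a, R2) = a * 2^32 / 2^16 = a * R mod q, which
+ * is how values are converted into Montgomery representation (see
+ * mq_poly_tomonty() below); conversely, mq_montymul(a, 1) = a / R mod q
+ * takes a value back out of Montgomery representation.
+ */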
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x, y and z are lower than q, the sum will
+ * be no more than (2^15 - 1) * q + (q - 1)^2, which will
+ * fit on 29 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We do a subtraction then conditional subtraction to
+ * ensure the result is in the expected range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
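+    /*
+     * Concretely, y18 = R*(y^(q-2)) mod q, so the Montgomery product
+     * with x cancels the R factor and yields x*(y^(q-2)) = x/y mod q.
+     */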
+ return mq_montymul(y18, x);
+}
+
+/*
+ * Compute NTT on a ring element.
+ */
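+/*
+ * The loop below is an iterative number-theoretic transform (a
+ * Cooley-Tukey FFT over Z_q): at each depth, every butterfly maps the
+ * pair (u, v) to (u + s*v, u - s*v) mod q, where s is a root of unity
+ * taken from GMb[] (stored in Montgomery representation, so that
+ * mq_montymul(v, s) directly yields the plain product v*s mod q).
+ */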
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+     * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
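+    /*
+     * On exit from the halving loop, ni = R/n mod q, so the final
+     * Montgomery multiplication computes a[m] * (R/n) / R = a[m]/n mod q.
+     */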
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_AVX2_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
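+    /*
+     * A valid Falcon signature satisfies s1 + s2*h = c0 mod phi mod q,
+     * where c0 is the hashed message; since only the norm of (s1, s2)
+     * matters, we recompute the negation -s1 directly.
+     */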
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED1024_AVX2_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+     * keep that information in a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED1024_AVX2_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/LICENSE b/src/sig/falcon/pqclean_falcon-padded-1024_clean/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/api.h b/src/sig/falcon/pqclean_falcon-padded-1024_clean/api.h
new file mode 100644
index 000000000..0d38a55f7
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED1024_CLEAN_API_H
+#define PQCLEAN_FALCONPADDED1024_CLEAN_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES 2305
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES 1793
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES 1280
+
+#define PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_ALGNAME "Falcon-padded-1024"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
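+/*
+ * Minimal usage sketch (illustrative only; the message content is a
+ * placeholder and failures are simply aborted, which would need
+ * <stdlib.h>):
+ *
+ *    uint8_t pk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES];
+ *    uint8_t sk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES];
+ *    uint8_t sig[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES];
+ *    uint8_t msg[3] = { 1, 2, 3 };
+ *    size_t siglen;
+ *
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(pk, sk) != 0) {
+ *        abort();
+ *    }
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(sig, &siglen,
+ *            msg, sizeof msg, sk) != 0) {
+ *        abort();
+ *    }
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(sig, siglen,
+ *            msg, sizeof msg, pk) != 0) {
+ *        abort();
+ *    }
+ */
+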
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/codec.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/codec.c
new file mode 100644
index 000000000..9556fe73a
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+         * will fit in the accumulator, which is a uint32_t.
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
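+        /*
+         * For example, the coefficient -300 is encoded as the sign bit 1,
+         * the seven low bits 0101100 (44), then 001 (two zeros for the
+         * high part 300 >> 7 = 2, followed by the terminating one), for
+         * a total of 11 bits.
+         */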
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED1024_CLEAN_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
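+/*
+ * For instance, at the Falcon-padded-1024 degree (logn = 10), f and g
+ * coefficients are encoded with 5 bits each (range -15..+15), while F
+ * and G coefficients use 8 bits each (range -127..+127).
+ */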
+
+const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/common.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/common.c
new file mode 100644
index 000000000..87c6771c2
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit value. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce modulo q the values; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
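+    /*
+     * Small example: with four samples of which the first and last were
+     * rejected ("- V V -"), the pass with p = 1 moves each valid value
+     * down by one slot, yielding "V V - -".
+     */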
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
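The saturation in is_short() works because each added square is at most 2^30, so the running 32-bit sum cannot skip over the [2^31, 2^32) window without setting bit 31 at least once; ng records that event, and the final OR forces s to 2^32-1, which exceeds every l2bound entry. A standalone sketch (my own harness, not part of the patch) compares the 32-bit accumulation against a 64-bit reference on pseudorandom vectors:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* 32-bit saturating accumulation, as in is_short() above. */
static uint32_t sat_norm(const int16_t *v, size_t n) {
    uint32_t s = 0, ng = 0;
    for (size_t u = 0; u < n; u ++) {
        int32_t z = v[u];
        s += (uint32_t)(z * z);
        ng |= s;
    }
    s |= -(ng >> 31);
    return s;
}

int main(void) {
    int16_t v[1024];
    for (int iter = 0; iter < 1000; iter ++) {
        uint64_t ref = 0;
        for (size_t u = 0; u < 1024; u ++) {
            v[u] = (int16_t)((rand() & 0xFFFF) - 0x8000);
            ref += (uint64_t)((int64_t)v[u] * v[u]);
        }
        uint32_t expected = (ref > 0x7FFFFFFFu) ? 0xFFFFFFFFu : (uint32_t)ref;
        if (sat_norm(v, 1024) != expected) {
            printf("mismatch\n");
            return 1;
        }
    }
    printf("saturating norm OK\n");
    return 0;
}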
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/fft.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fft.c
new file mode 100644
index 000000000..f0d5bd842
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fft.c
@@ -0,0 +1,699 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
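For reference, FPC_MUL and FPC_DIV implement the usual complex product and quotient (the divisor is inverted via its squared modulus). A small standalone sketch, using plain double rather than the fpr type, cross-checks the formulas against C99 <complex.h>:

#include <complex.h>
#include <stdio.h>

int main(void) {
    double a_re = 1.25, a_im = -0.5, b_re = 0.75, b_im = 2.0;

    /* Same formulas as FPC_MUL / FPC_DIV, in plain double. */
    double m_re = a_re * b_re - a_im * b_im;
    double m_im = a_re * b_im + a_im * b_re;
    double inv = 1.0 / (b_re * b_re + b_im * b_im);
    double d_re = (a_re * b_re + a_im * b_im) * inv;
    double d_im = (a_im * b_re - a_re * b_im) * inv;

    /* Reference using the C99 complex type. */
    double complex a = a_re + a_im * I, b = b_re + b_im * I;
    double complex m = a * b, d = a / b;

    printf("mul: %g %g vs %g %g\n", m_re, m_im, creal(m), cimag(m));
    printf("div: %g %g vs %g %g\n", d_re, d_im, creal(d), cimag(d));
    return 0;
}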
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
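To make the storage convention concrete, the following standalone sketch (illustrative only, not part of the patch) prints, for N = 8, the slots that receive Re(f(w_j)) and Im(f(w_j)), and incidentally shows that rev(j) is even for j < N/2:

#include <stdio.h>

/* Bit-reversal of j over 'logn' bits. */
static unsigned rev(unsigned j, unsigned logn) {
    unsigned r = 0;
    for (unsigned k = 0; k < logn; k ++) {
        r = (r << 1) | ((j >> k) & 1);
    }
    return r;
}

int main(void) {
    unsigned logn = 3, N = 1u << logn;
    for (unsigned j = 0; j < N / 2; j ++) {
        unsigned rj = rev(j, logn);
        /* rj is even for j < N/2, so rj/2 is an exact slot index. */
        printf("f(w_%u): Re -> slot %u, Im -> slot %u\n",
               j, rj / 2, rj / 2 + N / 2);
    }
    return 0;
}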
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+ * simple doubling of all the values in the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
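A minimal round-trip harness, as a sketch: it assumes it is compiled together with fft.c and fpr.c from this directory, and that inner.h exposes the fpr_of and fpr_rint helpers used elsewhere in the code. Applying FFT then iFFT to small integer coefficients should give back the original values after rounding:

#include <stdio.h>
#include "inner.h"

int main(void) {
    unsigned logn = 4;            /* degree n = 16 for a quick check */
    size_t n = (size_t)1 << logn;
    fpr f[16];
    int64_t ref[16];

    for (size_t u = 0; u < n; u ++) {
        ref[u] = (int64_t)(u * 3) - 20;   /* arbitrary small integers */
        f[u] = fpr_of(ref[u]);
    }
    PQCLEAN_FALCONPADDED1024_CLEAN_FFT(f, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(f, logn);
    for (size_t u = 0; u < n; u ++) {
        if (fpr_rint(f[u]) != ref[u]) {
            printf("mismatch at %zu\n", u);
            return 1;
        }
    }
    printf("FFT/iFFT round trip OK\n");
    return 0;
}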
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * bit-reversal function over the ring degree). This changes
+ * indexes with regard to the Falcon specification.
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * We process complex values by pairs. For logn = 1, there is only
+ * one complex value (the other one is the implicit conjugate),
+ * so we add the two lines below because the loop will be
+ * skipped.
+ */
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * An extra copy to handle the special case logn = 1.
+ */
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+}
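Under the same assumptions as the previous harness (compiled with this directory's fft.c and fpr.c, using the fpr_of and fpr_rint helpers), poly_split_fft followed by poly_merge_fft should reproduce the input up to floating-point rounding; a brief editorial sketch:

#include <stdio.h>
#include "inner.h"

int main(void) {
    unsigned logn = 4;
    size_t n = (size_t)1 << logn;
    fpr f[16], f0[8], f1[8], g[16];
    int64_t ref[16];

    for (size_t u = 0; u < n; u ++) {
        ref[u] = (int64_t)u - 7;
        f[u] = fpr_of(ref[u]);
    }
    PQCLEAN_FALCONPADDED1024_CLEAN_FFT(f, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(f0, f1, f, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(g, f0, f1, logn);
    PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(g, logn);
    for (size_t u = 0; u < n; u ++) {
        if (fpr_rint(g[u]) != ref[u]) {
            printf("mismatch at %zu\n", u);
            return 1;
        }
    }
    printf("split/merge round trip OK\n");
    return 0;
}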
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.c
new file mode 100644
index 000000000..82ff1df46
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.c
@@ -0,0 +1,1622 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Normalize a provided unsigned integer to the 2^63..2^64-1 range by
+ * left-shifting it if necessary. The exponent e is adjusted accordingly
+ * (i.e. if the value was left-shifted by n bits, then n is subtracted
+ * from e). If source m is 0, then it remains 0, but e is altered.
+ * Both m and e must be simple variables (no expressions allowed).
+ */
+#define FPR_NORM64(m, e) do { \
+ uint32_t nt; \
+ \
+ (e) -= 63; \
+ \
+ nt = (uint32_t)((m) >> 32); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 5); \
+ \
+ nt = (uint32_t)((m) >> 48); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 4); \
+ \
+ nt = (uint32_t)((m) >> 56); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 3); \
+ \
+ nt = (uint32_t)((m) >> 60); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 2); \
+ \
+ nt = (uint32_t)((m) >> 62); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 1); \
+ \
+ nt = (uint32_t)((m) >> 63); \
+ (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt); \
+ } while (0)
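FPR_NORM64 normalizes in a fixed number of steps by testing the top 32, 16, 8, 4, 2 and then 1 bits and conditionally shifting. The sketch below re-expresses the same idea as a loop (for illustration only; it is not the macro itself) and checks it against a naive shift loop:

#include <stdint.h>
#include <stdio.h>

/* Branchless normalization to the 2^63..2^64-1 range, mirroring FPR_NORM64
 * (the m = 0 case, which the macro also tolerates, is skipped here because
 * the naive reference below would not terminate on it). */
static void norm64(uint64_t *m, int *e) {
    *e -= 63;
    for (int sh = 32; sh >= 1; sh >>= 1) {
        uint32_t nt = (uint32_t)(*m >> (64 - sh));      /* top 'sh' bits */
        nt = (nt | -nt) >> 31;                          /* 1 if nonzero, else 0 */
        *m ^= (*m ^ (*m << sh)) & ((uint64_t)nt - 1);   /* shift only if zero */
        *e += (int)(nt * (uint32_t)sh);
    }
}

int main(void) {
    uint64_t samples[] = { 1, 0x1234, (uint64_t)1 << 40, 0x8000000000000000u };
    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i ++) {
        uint64_t m = samples[i], r = samples[i];
        int e = 0, er = 0;
        norm64(&m, &e);
        while (!(r >> 63)) { r <<= 1; er --; }   /* naive reference */
        if (m != r || e != er) {
            printf("mismatch for sample %zu\n", i);
            return 1;
        }
    }
    printf("normalization OK\n");
    return 0;
}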
+
+fpr
+fpr_scaled(int64_t i, int sc) {
+ /*
+ * To convert from int to float, we have to do the following:
+ * 1. Get the absolute value of the input, and its sign
+ * 2. Shift right or left the value as appropriate
+ * 3. Pack the result
+ *
+ * We can assume that the source integer is not -2^63.
+ */
+ int s, e;
+ uint32_t t;
+ uint64_t m;
+
+ /*
+ * Extract sign bit.
+ * We have: -i = 1 + ~i
+ */
+ s = (int)((uint64_t)i >> 63);
+ i ^= -(int64_t)s;
+ i += s;
+
+ /*
+ * For now we suppose that i != 0.
+ * We set m to i and left-shift it as much as needed
+ * to get a 1 in the top bit. We can do that in a logarithmic
+ * number of conditional shifts.
+ */
+ m = (uint64_t)i;
+ e = 9 + sc;
+ FPR_NORM64(m, e);
+
+ /*
+ * Now m is in the 2^63..2^64-1 range. We must divide it by 512;
+ * if one of the dropped bits is a 1, this should go into the
+ * "sticky bit".
+ */
+ m |= ((uint32_t)m & 0x1FF) + 0x1FF;
+ m >>= 9;
+
+ /*
+ * Corrective action: if i = 0 then all of the above was
+ * incorrect, and we clamp e and m down to zero.
+ */
+ t = (uint32_t)((uint64_t)(i | -i) >> 63);
+ m &= -(uint64_t)t;
+ e &= -(int)t;
+
+ /*
+ * Assemble back everything. The FPR() function will handle cases
+ * where e is too low.
+ */
+ return FPR(s, e, m);
+}
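The two-line "sticky" shift used in fpr_scaled above (and again in fpr_add and fpr_mul) computes m >> 9 while forcing the low bit of the result to 1 whenever any of the nine dropped bits was set. A standalone editorial check of that identity:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Check the sticky right-shift identity on a range of values. */
    for (uint64_t v = 0; v < 200000; v ++) {
        uint64_t m = v * 0x9E3779B97F4A7C15u;   /* spread bits around */

        uint64_t sticky = m;
        sticky |= ((uint32_t)sticky & 0x1FF) + 0x1FF;
        sticky >>= 9;

        uint64_t expected = (m >> 9) | ((m & 0x1FF) != 0);
        if (sticky != expected) {
            printf("mismatch at m=%llu\n", (unsigned long long)m);
            return 1;
        }
    }
    printf("sticky shift OK\n");
    return 0;
}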
+
+fpr
+fpr_add(fpr x, fpr y) {
+ uint64_t m, xu, yu, za;
+ uint32_t cs;
+ int ex, ey, sx, sy, cc;
+
+ /*
+ * Make sure that the first operand (x) has the larger absolute
+ * value. This guarantees that the exponent of y is less than
+ * or equal to the exponent of x, and, if they are equal, then
+ * the mantissa of y will not be greater than the mantissa of x.
+ *
+ * After this swap, the result will have the sign of x, except in
+ * the following edge case: abs(x) = abs(y), and x and y have
+ * opposite sign bits; in that case, the result shall be +0
+ * even if the sign bit of x is 1. To handle this case properly,
+ * we do the swap if abs(x) = abs(y) AND the sign of x is 1.
+ */
+ m = ((uint64_t)1 << 63) - 1;
+ za = (x & m) - (y & m);
+ cs = (uint32_t)(za >> 63)
+ | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63));
+ m = (x ^ y) & -(uint64_t)cs;
+ x ^= m;
+ y ^= m;
+
+ /*
+ * Extract sign bits, exponents and mantissas. The mantissas are
+ * scaled up to 2^55..2^56-1, and the exponent is unbiased. If
+ * an operand is zero, its mantissa is set to 0 at this step, and
+ * its exponent will be -1078.
+ */
+ ex = (int)(x >> 52);
+ sx = ex >> 11;
+ ex &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52;
+ xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ex -= 1078;
+ ey = (int)(y >> 52);
+ sy = ey >> 11;
+ ey &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52;
+ yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ey -= 1078;
+
+ /*
+ * x has the larger exponent; hence, we only need to right-shift y.
+ * If the shift count is larger than 59 bits then we clamp the
+ * value to zero.
+ */
+ cc = ex - ey;
+ yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31);
+ cc &= 63;
+
+ /*
+ * The lowest bit of yu is "sticky".
+ */
+ m = fpr_ulsh(1, cc) - 1;
+ yu |= (yu & m) + m;
+ yu = fpr_ursh(yu, cc);
+
+ /*
+ * If the operands have the same sign, then we add the mantissas;
+ * otherwise, we subtract the mantissas.
+ */
+ xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy));
+
+ /*
+ * The result may be smaller, or slightly larger. We normalize
+ * it to the 2^63..2^64-1 range (if xu is zero, then it stays
+ * at zero).
+ */
+ FPR_NORM64(xu, ex);
+
+ /*
+ * Scale down the value to 2^54..2^55-1, handling the last bit
+ * as sticky.
+ */
+ xu |= ((uint32_t)xu & 0x1FF) + 0x1FF;
+ xu >>= 9;
+ ex += 9;
+
+ /*
+ * In general, the result has the sign of x. However, if the
+ * result is exactly zero, then the following situations may
+ * be encountered:
+ * x > 0, y = -x -> result should be +0
+ * x < 0, y = -x -> result should be +0
+ * x = +0, y = +0 -> result should be +0
+ * x = -0, y = +0 -> result should be +0
+ * x = +0, y = -0 -> result should be +0
+ * x = -0, y = -0 -> result should be -0
+ *
+ * But at the conditional swap step at the start of the
+ * function, we ensured that if abs(x) = abs(y) and the
+ * sign of x was 1, then x and y were swapped. Thus, the
+ * two following cases cannot actually happen:
+ * x < 0, y = -x
+ * x = -0, y = +0
+ * In all other cases, the sign bit of x is conserved, which
+ * is what the FPR() function does. The FPR() function also
+ * properly clamps values to zero when the exponent is too
+ * low, but does not alter the sign in that case.
+ */
+ return FPR(sx, ex, xu);
+}
+
+fpr
+fpr_mul(fpr x, fpr y) {
+ uint64_t xu, yu, w, zu, zv;
+ uint32_t x0, x1, y0, y1, z0, z1, z2;
+ int ex, ey, d, e, s;
+
+ /*
+ * Extract absolute values as scaled unsigned integers. We
+ * don't extract exponents yet.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * We have two 53-bit integers to multiply; we need to split
+ * each into a lower half and an upper half. Moreover, we
+ * prefer the lower halves to be of 25 bits each, for
+ * reasons explained later on.
+ */
+ x0 = (uint32_t)xu & 0x01FFFFFF;
+ x1 = (uint32_t)(xu >> 25);
+ y0 = (uint32_t)yu & 0x01FFFFFF;
+ y1 = (uint32_t)(yu >> 25);
+ w = (uint64_t)x0 * (uint64_t)y0;
+ z0 = (uint32_t)w & 0x01FFFFFF;
+ z1 = (uint32_t)(w >> 25);
+ w = (uint64_t)x0 * (uint64_t)y1;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 = (uint32_t)(w >> 25);
+ w = (uint64_t)x1 * (uint64_t)y0;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 += (uint32_t)(w >> 25);
+ zu = (uint64_t)x1 * (uint64_t)y1;
+ z2 += (z1 >> 25);
+ z1 &= 0x01FFFFFF;
+ zu += z2;
+
+ /*
+ * Since xu and yu are both in the 2^52..2^53-1 range, the
+ * product is in the 2^104..2^106-1 range. We first reassemble
+ * it and round it into the 2^54..2^56-1 range; the bottom bit
+ * is made "sticky". Since the low limbs z0 and z1 are 25 bits
+ * each, we just take the upper part (zu), and consider z0 and
+ * z1 only for purposes of stickiness.
+ * (This is the reason why we chose 25-bit limbs above.)
+ */
+ zu |= ((z0 | z1) + 0x01FFFFFF) >> 25;
+
+ /*
+ * We normalize zu to the 2^54..2^55-1 range: it could be one
+ * bit too large at this point. This is done with a conditional
+ * right-shift that takes into account the sticky bit.
+ */
+ zv = (zu >> 1) | (zu & 1);
+ w = zu >> 55;
+ zu ^= (zu ^ zv) & -w;
+
+ /*
+ * Get the aggregate scaling factor:
+ *
+ * - Each exponent is biased by 1023.
+ *
+ * - Integral mantissas are scaled by 2^52, hence an
+ * extra 52 bias for each exponent.
+ *
+ * - However, we right-shifted z by 50 bits, and then
+ * by 0 or 1 extra bit (depending on the value of w).
+ *
+ * In total, we must add the exponents, then subtract
+ * 2 * (1023 + 52), then add 50 + w.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex + ey - 2100 + (int)w;
+
+ /*
+ * Sign bit is the XOR of the operand sign bits.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if either of the operands is
+ * zero, then the computations above were wrong. Test for zero
+ * is whether ex or ey is zero. We just have to set the mantissa
+ * (zu) to zero, the FPR() function will normalize e.
+ */
+ d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11;
+ zu &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, zu);
+}
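The 25-bit limb split keeps every intermediate sum of the 53x53-bit product within 64 bits while still producing the exact 106-bit result (as zu*2^50 + z1*2^25 + z0). The following sketch (an editorial cross-check assuming a compiler with the unsigned __int128 extension, e.g. GCC or Clang) reassembles those limbs and compares against a 128-bit reference product:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    for (int iter = 0; iter < 100000; iter ++) {
        /* Random 53-bit mantissas in the 2^52..2^53-1 range. */
        uint64_t xu = ((((uint64_t)rand() << 31) ^ (uint64_t)rand())
                      % ((uint64_t)1 << 52)) | ((uint64_t)1 << 52);
        uint64_t yu = ((((uint64_t)rand() << 31) ^ (uint64_t)rand())
                      % ((uint64_t)1 << 52)) | ((uint64_t)1 << 52);

        /* Same 25-bit limb schoolbook multiplication as in fpr_mul(). */
        uint32_t x0 = (uint32_t)xu & 0x01FFFFFF, x1 = (uint32_t)(xu >> 25);
        uint32_t y0 = (uint32_t)yu & 0x01FFFFFF, y1 = (uint32_t)(yu >> 25);
        uint64_t w, zu;
        uint32_t z0, z1, z2;
        w = (uint64_t)x0 * (uint64_t)y0;
        z0 = (uint32_t)w & 0x01FFFFFF;
        z1 = (uint32_t)(w >> 25);
        w = (uint64_t)x0 * (uint64_t)y1;
        z1 += (uint32_t)w & 0x01FFFFFF;
        z2 = (uint32_t)(w >> 25);
        w = (uint64_t)x1 * (uint64_t)y0;
        z1 += (uint32_t)w & 0x01FFFFFF;
        z2 += (uint32_t)(w >> 25);
        zu = (uint64_t)x1 * (uint64_t)y1;
        z2 += (z1 >> 25);
        z1 &= 0x01FFFFFF;
        zu += z2;

        /* Reassemble and compare against a 128-bit reference product. */
        unsigned __int128 got = ((unsigned __int128)zu << 50)
                              + ((unsigned __int128)z1 << 25) + z0;
        unsigned __int128 ref = (unsigned __int128)xu * yu;
        if (got != ref) {
            printf("mismatch\n");
            return 1;
        }
    }
    printf("limb product OK\n");
    return 0;
}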
+
+fpr
+fpr_div(fpr x, fpr y) {
+ uint64_t xu, yu, q, q2, w;
+ int i, ex, ey, e, d, s;
+
+ /*
+ * Extract mantissas of x and y (unsigned).
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * Perform bit-by-bit division of xu by yu. We run it for 55 bits.
+ */
+ q = 0;
+ for (i = 0; i < 55; i ++) {
+ /*
+ * If yu is less than or equal to xu, then subtract it and
+ * push a 1 in the quotient; otherwise, leave xu unchanged
+ * and push a 0.
+ */
+ uint64_t b;
+
+ b = ((xu - yu) >> 63) - 1;
+ xu -= b & yu;
+ q |= b & 1;
+ xu <<= 1;
+ q <<= 1;
+ }
+
+ /*
+ * We got 55 bits in the quotient, followed by an extra zero. We
+ * want that 56th bit to be "sticky": it should be a 1 if and
+ * only if the remainder (xu) is non-zero.
+ */
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Quotient is at most 2^56-1. Its top bit may be zero, but in
+ * that case the next-to-top bit will be a one, since the
+ * initial xu and yu were both in the 2^52..2^53-1 range.
+ * We perform a conditional shift to normalize q to the
+ * 2^54..2^55-1 range (with the bottom bit being sticky).
+ */
+ q2 = (q >> 1) | (q & 1);
+ w = q >> 55;
+ q ^= (q ^ q2) & -w;
+
+ /*
+ * Extract exponents to compute the scaling factor:
+ *
+ * - Each exponent is biased and we scaled them up by
+ * 52 bits; but these biases will cancel out.
+ *
+ * - The division loop produced a 55-bit shifted result,
+ * so we must scale it down by 55 bits.
+ *
+ * - If w = 1, we right-shifted the integer by 1 bit,
+ * hence we must add 1 to the scaling.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex - ey - 55 + (int)w;
+
+ /*
+ * Sign is the XOR of the signs of the operands.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if x = 0, then the computation
+ * is wrong, and we must clamp e and q to 0. We do not care
+ * about the case y = 0 (as per assumptions in this module,
+ * the caller does not perform divisions by zero).
+ */
+ d = (ex + 0x7FF) >> 11;
+ s &= d;
+ e &= -d;
+ q &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, q);
+}
+
+fpr
+fpr_sqrt(fpr x) {
+ uint64_t xu, q, s, r;
+ int ex, e;
+
+ /*
+ * Extract the mantissa and the exponent. We don't care about
+ * the sign: by assumption, the operand is nonnegative.
+ * We want the "true" exponent corresponding to a mantissa
+ * in the 1..2 range.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ ex = (int)((x >> 52) & 0x7FF);
+ e = ex - 1023;
+
+ /*
+ * If the exponent is odd, double the mantissa and decrement
+ * the exponent. The exponent is then halved to account for
+ * the square root.
+ */
+ xu += xu & -(uint64_t)(e & 1);
+ e >>= 1;
+
+ /*
+ * Double the mantissa.
+ */
+ xu <<= 1;
+
+ /*
+ * We now have a mantissa in the 2^53..2^55-1 range. It
+ * represents a value between 1 (inclusive) and 4 (exclusive)
+ * in fixed point notation (with 53 fractional bits). We
+ * compute the square root bit by bit.
+ */
+ q = 0;
+ s = 0;
+ r = (uint64_t)1 << 53;
+ for (int i = 0; i < 54; i ++) {
+ uint64_t t, b;
+
+ t = s + r;
+ b = ((xu - t) >> 63) - 1;
+ s += (r << 1) & b;
+ xu -= t & b;
+ q += r & b;
+ xu <<= 1;
+ r >>= 1;
+ }
+
+ /*
+ * Now, q is a rounded-low 54-bit value, with a leading 1,
+ * 52 fractional digits, and an additional guard bit. We add
+ * an extra sticky bit to account for what remains of the operand.
+ */
+ q <<= 1;
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Result q is in the 2^54..2^55-1 range; we bias the exponent
+ * by 54 bits (the value e at that point contains the "true"
+ * exponent, but q is now considered an integer, i.e. scaled
+ * up).
+ */
+ e -= 54;
+
+ /*
+ * Corrective action for an operand of value zero.
+ */
+ q &= -(uint64_t)((ex + 0x7FF) >> 11);
+
+ /*
+ * Apply rounding and return the result.
+ */
+ return FPR(0, e, q);
+}
+
+uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Here, the coefficients have been scaled up by 2^63 and
+ * converted to integers.
+ *
+ * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+ static const uint64_t C[] = {
+ 0x00000004741183A3u,
+ 0x00000036548CFC06u,
+ 0x0000024FDCBF140Au,
+ 0x0000171D939DE045u,
+ 0x0000D00CF58F6F84u,
+ 0x000680681CF796E3u,
+ 0x002D82D8305B0FEAu,
+ 0x011111110E066FD0u,
+ 0x0555555555070F00u,
+ 0x155555555581FF00u,
+ 0x400000000002B400u,
+ 0x7FFFFFFFFFFF4800u,
+ 0x8000000000000000u
+ };
+
+ uint64_t z, y;
+ unsigned u;
+ uint32_t z0, z1, y0, y1;
+ uint64_t a, b;
+
+ y = C[0];
+ z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
+ for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) {
+ /*
+ * Compute product z * y over 128 bits, but keep only
+ * the top 64 bits.
+ *
+ * TODO: On some architectures/compilers we could use
+ * some intrinsics (__umulh() on MSVC) or other compiler
+ * extensions (unsigned __int128 on GCC / Clang) for
+ * improved speed; however, most 64-bit architectures
+ * also have appropriate IEEE754 floating-point support,
+ * which is better.
+ */
+ uint64_t c;
+
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ c = (a >> 32) + (b >> 32);
+ c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ c += (uint64_t)z1 * (uint64_t)y1;
+ y = C[u] - c;
+ }
+
+ /*
+ * The scaling factor must be applied at the end. Since y is now
+ * in fixed-point notation, we have to convert the factor to the
+ * same format, and do an extra integer multiplication.
+ */
+ z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ y = (a >> 32) + (b >> 32);
+ y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ y += (uint64_t)z1 * (uint64_t)y1;
+
+ return y;
+}
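The inner loop above computes the top 64 bits of a 64x64-bit product from 32-bit limbs; as the code's own comment notes, an unsigned __int128 product would give the same result on compilers that support that extension. A standalone sketch (editorial, not part of the patch) checking the limb computation against such a reference:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Top 64 bits of a 64x64-bit product, using only 32-bit limbs,
 * exactly as in the loop of fpr_expm_p63() above. */
static uint64_t mul_high64(uint64_t z, uint64_t y) {
    uint32_t z0 = (uint32_t)z, z1 = (uint32_t)(z >> 32);
    uint32_t y0 = (uint32_t)y, y1 = (uint32_t)(y >> 32);
    uint64_t a = (uint64_t)z0 * (uint64_t)y1
               + (((uint64_t)z0 * (uint64_t)y0) >> 32);
    uint64_t b = (uint64_t)z1 * (uint64_t)y0;
    uint64_t c = (a >> 32) + (b >> 32);
    c += ((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32;
    c += (uint64_t)z1 * (uint64_t)y1;
    return c;
}

int main(void) {
    for (int i = 0; i < 1000000; i ++) {
        uint64_t z = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20)
                   ^ (uint64_t)rand();
        uint64_t y = ((uint64_t)rand() << 40) ^ ((uint64_t)rand() << 20)
                   ^ (uint64_t)rand();
        uint64_t ref = (uint64_t)(((unsigned __int128)z * y) >> 64);
        if (mul_high64(z, y) != ref) {
            printf("mismatch\n");
            return 1;
        }
    }
    printf("high-64 multiply OK\n");
    return 0;
}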
+
+const fpr fpr_gm_tab[] = {
+ 0, 0,
+ 9223372036854775808U, 4607182418800017408U,
+ 4604544271217802189U, 4604544271217802189U,
+ 13827916308072577997U, 4604544271217802189U,
+ 4606496786581982534U, 4600565431771507043U,
+ 13823937468626282851U, 4606496786581982534U,
+ 4600565431771507043U, 4606496786581982534U,
+ 13829868823436758342U, 4600565431771507043U,
+ 4607009347991985328U, 4596196889902818827U,
+ 13819568926757594635U, 4607009347991985328U,
+ 4603179351334086856U, 4605664432017547683U,
+ 13829036468872323491U, 4603179351334086856U,
+ 4605664432017547683U, 4603179351334086856U,
+ 13826551388188862664U, 4605664432017547683U,
+ 4596196889902818827U, 4607009347991985328U,
+ 13830381384846761136U, 4596196889902818827U,
+ 4607139046673687846U, 4591727299969791020U,
+ 13815099336824566828U, 4607139046673687846U,
+ 4603889326261607894U, 4605137878724712257U,
+ 13828509915579488065U, 4603889326261607894U,
+ 4606118860100255153U, 4602163548591158843U,
+ 13825535585445934651U, 4606118860100255153U,
+ 4598900923775164166U, 4606794571824115162U,
+ 13830166608678890970U, 4598900923775164166U,
+ 4606794571824115162U, 4598900923775164166U,
+ 13822272960629939974U, 4606794571824115162U,
+ 4602163548591158843U, 4606118860100255153U,
+ 13829490896955030961U, 4602163548591158843U,
+ 4605137878724712257U, 4603889326261607894U,
+ 13827261363116383702U, 4605137878724712257U,
+ 4591727299969791020U, 4607139046673687846U,
+ 13830511083528463654U, 4591727299969791020U,
+ 4607171569234046334U, 4587232218149935124U,
+ 13810604255004710932U, 4607171569234046334U,
+ 4604224084862889120U, 4604849113969373103U,
+ 13828221150824148911U, 4604224084862889120U,
+ 4606317631232591731U, 4601373767755717824U,
+ 13824745804610493632U, 4606317631232591731U,
+ 4599740487990714333U, 4606655894547498725U,
+ 13830027931402274533U, 4599740487990714333U,
+ 4606912484326125783U, 4597922303871901467U,
+ 13821294340726677275U, 4606912484326125783U,
+ 4602805845399633902U, 4605900952042040894U,
+ 13829272988896816702U, 4602805845399633902U,
+ 4605409869824231233U, 4603540801876750389U,
+ 13826912838731526197U, 4605409869824231233U,
+ 4594454542771183930U, 4607084929468638487U,
+ 13830456966323414295U, 4594454542771183930U,
+ 4607084929468638487U, 4594454542771183930U,
+ 13817826579625959738U, 4607084929468638487U,
+ 4603540801876750389U, 4605409869824231233U,
+ 13828781906679007041U, 4603540801876750389U,
+ 4605900952042040894U, 4602805845399633902U,
+ 13826177882254409710U, 4605900952042040894U,
+ 4597922303871901467U, 4606912484326125783U,
+ 13830284521180901591U, 4597922303871901467U,
+ 4606655894547498725U, 4599740487990714333U,
+ 13823112524845490141U, 4606655894547498725U,
+ 4601373767755717824U, 4606317631232591731U,
+ 13829689668087367539U, 4601373767755717824U,
+ 4604849113969373103U, 4604224084862889120U,
+ 13827596121717664928U, 4604849113969373103U,
+ 4587232218149935124U, 4607171569234046334U,
+ 13830543606088822142U, 4587232218149935124U,
+ 4607179706000002317U, 4582730748936808062U,
+ 13806102785791583870U, 4607179706000002317U,
+ 4604386048625945823U, 4604698657331085206U,
+ 13828070694185861014U, 4604386048625945823U,
+ 4606409688975526202U, 4600971798440897930U,
+ 13824343835295673738U, 4606409688975526202U,
+ 4600154912527631775U, 4606578871587619388U,
+ 13829950908442395196U, 4600154912527631775U,
+ 4606963563043808649U, 4597061974398750563U,
+ 13820434011253526371U, 4606963563043808649U,
+ 4602994049708411683U, 4605784983948558848U,
+ 13829157020803334656U, 4602994049708411683U,
+ 4605539368864982914U, 4603361638657888991U,
+ 13826733675512664799U, 4605539368864982914U,
+ 4595327571478659014U, 4607049811591515049U,
+ 13830421848446290857U, 4595327571478659014U,
+ 4607114680469659603U, 4593485039402578702U,
+ 13816857076257354510U, 4607114680469659603U,
+ 4603716733069447353U, 4605276012900672507U,
+ 13828648049755448315U, 4603716733069447353U,
+ 4606012266443150634U, 4602550884377336506U,
+ 13825922921232112314U, 4606012266443150634U,
+ 4598476289818621559U, 4606856142606846307U,
+ 13830228179461622115U, 4598476289818621559U,
+ 4606727809065869586U, 4599322407794599425U,
+ 13822694444649375233U, 4606727809065869586U,
+ 4601771097584682078U, 4606220668805321205U,
+ 13829592705660097013U, 4601771097584682078U,
+ 4604995550503212910U, 4604058477489546729U,
+ 13827430514344322537U, 4604995550503212910U,
+ 4589965306122607094U, 4607158013403433018U,
+ 13830530050258208826U, 4589965306122607094U,
+ 4607158013403433018U, 4589965306122607094U,
+ 13813337342977382902U, 4607158013403433018U,
+ 4604058477489546729U, 4604995550503212910U,
+ 13828367587357988718U, 4604058477489546729U,
+ 4606220668805321205U, 4601771097584682078U,
+ 13825143134439457886U, 4606220668805321205U,
+ 4599322407794599425U, 4606727809065869586U,
+ 13830099845920645394U, 4599322407794599425U,
+ 4606856142606846307U, 4598476289818621559U,
+ 13821848326673397367U, 4606856142606846307U,
+ 4602550884377336506U, 4606012266443150634U,
+ 13829384303297926442U, 4602550884377336506U,
+ 4605276012900672507U, 4603716733069447353U,
+ 13827088769924223161U, 4605276012900672507U,
+ 4593485039402578702U, 4607114680469659603U,
+ 13830486717324435411U, 4593485039402578702U,
+ 4607049811591515049U, 4595327571478659014U,
+ 13818699608333434822U, 4607049811591515049U,
+ 4603361638657888991U, 4605539368864982914U,
+ 13828911405719758722U, 4603361638657888991U,
+ 4605784983948558848U, 4602994049708411683U,
+ 13826366086563187491U, 4605784983948558848U,
+ 4597061974398750563U, 4606963563043808649U,
+ 13830335599898584457U, 4597061974398750563U,
+ 4606578871587619388U, 4600154912527631775U,
+ 13823526949382407583U, 4606578871587619388U,
+ 4600971798440897930U, 4606409688975526202U,
+ 13829781725830302010U, 4600971798440897930U,
+ 4604698657331085206U, 4604386048625945823U,
+ 13827758085480721631U, 4604698657331085206U,
+ 4582730748936808062U, 4607179706000002317U,
+ 13830551742854778125U, 4582730748936808062U,
+ 4607181740574479067U, 4578227681973159812U,
+ 13801599718827935620U, 4607181740574479067U,
+ 4604465633578481725U, 4604621949701367983U,
+ 13827993986556143791U, 4604465633578481725U,
+ 4606453861145241227U, 4600769149537129431U,
+ 13824141186391905239U, 4606453861145241227U,
+ 4600360675823176935U, 4606538458821337243U,
+ 13829910495676113051U, 4600360675823176935U,
+ 4606987119037722413U, 4596629994023683153U,
+ 13820002030878458961U, 4606987119037722413U,
+ 4603087070374583113U, 4605725276488455441U,
+ 13829097313343231249U, 4603087070374583113U,
+ 4605602459698789090U, 4603270878689749849U,
+ 13826642915544525657U, 4605602459698789090U,
+ 4595762727260045105U, 4607030246558998647U,
+ 13830402283413774455U, 4595762727260045105U,
+ 4607127537664763515U, 4592606767730311893U,
+ 13815978804585087701U, 4607127537664763515U,
+ 4603803453461190356U, 4605207475328619533U,
+ 13828579512183395341U, 4603803453461190356U,
+ 4606066157444814153U, 4602357870542944470U,
+ 13825729907397720278U, 4606066157444814153U,
+ 4598688984595225406U, 4606826008603986804U,
+ 13830198045458762612U, 4598688984595225406U,
+ 4606761837001494797U, 4599112075441176914U,
+ 13822484112295952722U, 4606761837001494797U,
+ 4601967947786150793U, 4606170366472647579U,
+ 13829542403327423387U, 4601967947786150793U,
+ 4605067233569943231U, 4603974338538572089U,
+ 13827346375393347897U, 4605067233569943231U,
+ 4590846768565625881U, 4607149205763218185U,
+ 13830521242617993993U, 4590846768565625881U,
+ 4607165468267934125U, 4588998070480937184U,
+ 13812370107335712992U, 4607165468267934125U,
+ 4604141730443515286U, 4604922840319727473U,
+ 13828294877174503281U, 4604141730443515286U,
+ 4606269759522929756U, 4601573027631668967U,
+ 13824945064486444775U, 4606269759522929756U,
+ 4599531889160152938U, 4606692493141721470U,
+ 13830064529996497278U, 4599531889160152938U,
+ 4606884969294623682U, 4598262871476403630U,
+ 13821634908331179438U, 4606884969294623682U,
+ 4602710690099904183U, 4605957195211051218U,
+ 13829329232065827026U, 4602710690099904183U,
+ 4605343481119364930U, 4603629178146150899U,
+ 13827001215000926707U, 4605343481119364930U,
+ 4594016801320007031U, 4607100477024622401U,
+ 13830472513879398209U, 4594016801320007031U,
+ 4607068040143112603U, 4594891488091520602U,
+ 13818263524946296410U, 4607068040143112603U,
+ 4603451617570386922U, 4605475169017376660U,
+ 13828847205872152468U, 4603451617570386922U,
+ 4605843545406134034U, 4602900303344142735U,
+ 13826272340198918543U, 4605843545406134034U,
+ 4597492765973365521U, 4606938683557690074U,
+ 13830310720412465882U, 4597492765973365521U,
+ 4606618018794815019U, 4599948172872067014U,
+ 13823320209726842822U, 4606618018794815019U,
+ 4601173347964633034U, 4606364276725003740U,
+ 13829736313579779548U, 4601173347964633034U,
+ 4604774382555066977U, 4604305528345395596U,
+ 13827677565200171404U, 4604774382555066977U,
+ 4585465300892538317U, 4607176315382986589U,
+ 13830548352237762397U, 4585465300892538317U,
+ 4607176315382986589U, 4585465300892538317U,
+ 13808837337747314125U, 4607176315382986589U,
+ 4604305528345395596U, 4604774382555066977U,
+ 13828146419409842785U, 4604305528345395596U,
+ 4606364276725003740U, 4601173347964633034U,
+ 13824545384819408842U, 4606364276725003740U,
+ 4599948172872067014U, 4606618018794815019U,
+ 13829990055649590827U, 4599948172872067014U,
+ 4606938683557690074U, 4597492765973365521U,
+ 13820864802828141329U, 4606938683557690074U,
+ 4602900303344142735U, 4605843545406134034U,
+ 13829215582260909842U, 4602900303344142735U,
+ 4605475169017376660U, 4603451617570386922U,
+ 13826823654425162730U, 4605475169017376660U,
+ 4594891488091520602U, 4607068040143112603U,
+ 13830440076997888411U, 4594891488091520602U,
+ 4607100477024622401U, 4594016801320007031U,
+ 13817388838174782839U, 4607100477024622401U,
+ 4603629178146150899U, 4605343481119364930U,
+ 13828715517974140738U, 4603629178146150899U,
+ 4605957195211051218U, 4602710690099904183U,
+ 13826082726954679991U, 4605957195211051218U,
+ 4598262871476403630U, 4606884969294623682U,
+ 13830257006149399490U, 4598262871476403630U,
+ 4606692493141721470U, 4599531889160152938U,
+ 13822903926014928746U, 4606692493141721470U,
+ 4601573027631668967U, 4606269759522929756U,
+ 13829641796377705564U, 4601573027631668967U,
+ 4604922840319727473U, 4604141730443515286U,
+ 13827513767298291094U, 4604922840319727473U,
+ 4588998070480937184U, 4607165468267934125U,
+ 13830537505122709933U, 4588998070480937184U,
+ 4607149205763218185U, 4590846768565625881U,
+ 13814218805420401689U, 4607149205763218185U,
+ 4603974338538572089U, 4605067233569943231U,
+ 13828439270424719039U, 4603974338538572089U,
+ 4606170366472647579U, 4601967947786150793U,
+ 13825339984640926601U, 4606170366472647579U,
+ 4599112075441176914U, 4606761837001494797U,
+ 13830133873856270605U, 4599112075441176914U,
+ 4606826008603986804U, 4598688984595225406U,
+ 13822061021450001214U, 4606826008603986804U,
+ 4602357870542944470U, 4606066157444814153U,
+ 13829438194299589961U, 4602357870542944470U,
+ 4605207475328619533U, 4603803453461190356U,
+ 13827175490315966164U, 4605207475328619533U,
+ 4592606767730311893U, 4607127537664763515U,
+ 13830499574519539323U, 4592606767730311893U,
+ 4607030246558998647U, 4595762727260045105U,
+ 13819134764114820913U, 4607030246558998647U,
+ 4603270878689749849U, 4605602459698789090U,
+ 13828974496553564898U, 4603270878689749849U,
+ 4605725276488455441U, 4603087070374583113U,
+ 13826459107229358921U, 4605725276488455441U,
+ 4596629994023683153U, 4606987119037722413U,
+ 13830359155892498221U, 4596629994023683153U,
+ 4606538458821337243U, 4600360675823176935U,
+ 13823732712677952743U, 4606538458821337243U,
+ 4600769149537129431U, 4606453861145241227U,
+ 13829825898000017035U, 4600769149537129431U,
+ 4604621949701367983U, 4604465633578481725U,
+ 13827837670433257533U, 4604621949701367983U,
+ 4578227681973159812U, 4607181740574479067U,
+ 13830553777429254875U, 4578227681973159812U,
+ 4607182249242036882U, 4573724215515480177U,
+ 13797096252370255985U, 4607182249242036882U,
+ 4604505071555817232U, 4604583231088591477U,
+ 13827955267943367285U, 4604505071555817232U,
+ 4606475480113671417U, 4600667422348321968U,
+ 13824039459203097776U, 4606475480113671417U,
+ 4600463181646572228U, 4606517779747998088U,
+ 13829889816602773896U, 4600463181646572228U,
+ 4606998399608725124U, 4596413578358834022U,
+ 13819785615213609830U, 4606998399608725124U,
+ 4603133304188877240U, 4605694995810664660U,
+ 13829067032665440468U, 4603133304188877240U,
+ 4605633586259814045U, 4603225210076562971U,
+ 13826597246931338779U, 4605633586259814045U,
+ 4595979936813835462U, 4607019963775302583U,
+ 13830392000630078391U, 4595979936813835462U,
+ 4607133460805585796U, 4592167175087283203U,
+ 13815539211942059011U, 4607133460805585796U,
+ 4603846496621587377U, 4605172808754305228U,
+ 13828544845609081036U, 4603846496621587377U,
+ 4606092657816072624U, 4602260871257280788U,
+ 13825632908112056596U, 4606092657816072624U,
+ 4598795050632330097U, 4606810452769876110U,
+ 13830182489624651918U, 4598795050632330097U,
+ 4606778366364612594U, 4599006600037663623U,
+ 13822378636892439431U, 4606778366364612594U,
+ 4602065906208722008U, 4606144763310860551U,
+ 13829516800165636359U, 4602065906208722008U,
+ 4605102686554936490U, 4603931940768740167U,
+ 13827303977623515975U, 4605102686554936490U,
+ 4591287158938884897U, 4607144295058764886U,
+ 13830516331913540694U, 4591287158938884897U,
+ 4607168688050493276U, 4588115294056142819U,
+ 13811487330910918627U, 4607168688050493276U,
+ 4604183020748362039U, 4604886103475043762U,
+ 13828258140329819570U, 4604183020748362039U,
+ 4606293848208650998U, 4601473544562720001U,
+ 13824845581417495809U, 4606293848208650998U,
+ 4599636300858866724U, 4606674353838411301U,
+ 13830046390693187109U, 4599636300858866724U,
+ 4606898891031025132U, 4598136582470364665U,
+ 13821508619325140473U, 4606898891031025132U,
+ 4602758354025980442U, 4605929219593405673U,
+ 13829301256448181481U, 4602758354025980442U,
+ 4605376811039722786U, 4603585091850767959U,
+ 13826957128705543767U, 4605376811039722786U,
+ 4594235767444503503U, 4607092871118901179U,
+ 13830464907973676987U, 4594235767444503503U,
+ 4607076652372832968U, 4594673119063280916U,
+ 13818045155918056724U, 4607076652372832968U,
+ 4603496309891590679U, 4605442656228245717U,
+ 13828814693083021525U, 4603496309891590679U,
+ 4605872393621214213U, 4602853162432841185U,
+ 13826225199287616993U, 4605872393621214213U,
+ 4597707695679609371U, 4606925748668145757U,
+ 13830297785522921565U, 4597707695679609371U,
+ 4606637115963965612U, 4599844446633109139U,
+ 13823216483487884947U, 4606637115963965612U,
+ 4601273700967202825U, 4606341107699334546U,
+ 13829713144554110354U, 4601273700967202825U,
+ 4604811873195349477U, 4604264921241055824U,
+ 13827636958095831632U, 4604811873195349477U,
+ 4586348876009622851U, 4607174111710118367U,
+ 13830546148564894175U, 4586348876009622851U,
+ 4607178180169683960U, 4584498631466405633U,
+ 13807870668321181441U, 4607178180169683960U,
+ 4604345904647073908U, 4604736643460027021U,
+ 13828108680314802829U, 4604345904647073908U,
+ 4606387137437298591U, 4601072712526242277U,
+ 13824444749381018085U, 4606387137437298591U,
+ 4600051662802353687U, 4606598603759044570U,
+ 13829970640613820378U, 4600051662802353687U,
+ 4606951288507767453U, 4597277522845151878U,
+ 13820649559699927686U, 4606951288507767453U,
+ 4602947266358709886U, 4605814408482919348U,
+ 13829186445337695156U, 4602947266358709886U,
+ 4605507406967535927U, 4603406726595779752U,
+ 13826778763450555560U, 4605507406967535927U,
+ 4595109641634432498U, 4607059093103722971U,
+ 13830431129958498779U, 4595109641634432498U,
+ 4607107746899444102U, 4593797652641645341U,
+ 13817169689496421149U, 4607107746899444102U,
+ 4603673059103075106U, 4605309881318010327U,
+ 13828681918172786135U, 4603673059103075106U,
+ 4605984877841711338U, 4602646891659203088U,
+ 13826018928513978896U, 4605984877841711338U,
+ 4598369669086960528U, 4606870719641066940U,
+ 13830242756495842748U, 4598369669086960528U,
+ 4606710311774494716U, 4599427256825614420U,
+ 13822799293680390228U, 4606710311774494716U,
+ 4601672213217083403U, 4606245366082353408U,
+ 13829617402937129216U, 4601672213217083403U,
+ 4604959323120302796U, 4604100215502905499U,
+ 13827472252357681307U, 4604959323120302796U,
+ 4589524267239410099U, 4607161910007591876U,
+ 13830533946862367684U, 4589524267239410099U,
+ 4607153778602162496U, 4590406145430462614U,
+ 13813778182285238422U, 4607153778602162496U,
+ 4604016517974851588U, 4605031521104517324U,
+ 13828403557959293132U, 4604016517974851588U,
+ 4606195668621671667U, 4601869677011524443U,
+ 13825241713866300251U, 4606195668621671667U,
+ 4599217346014614711U, 4606744984357082948U,
+ 13830117021211858756U, 4599217346014614711U,
+ 4606841238740778884U, 4598582729657176439U,
+ 13821954766511952247U, 4606841238740778884U,
+ 4602454542796181607U, 4606039359984203741U,
+ 13829411396838979549U, 4602454542796181607U,
+ 4605241877142478242U, 4603760198400967492U,
+ 13827132235255743300U, 4605241877142478242U,
+ 4593046061348462537U, 4607121277474223905U,
+ 13830493314328999713U, 4593046061348462537U,
+ 4607040195955932526U, 4595545269419264690U,
+ 13818917306274040498U, 4607040195955932526U,
+ 4603316355454250015U, 4605571053506370248U,
+ 13828943090361146056U, 4603316355454250015U,
+ 4605755272910869620U, 4603040651631881451U,
+ 13826412688486657259U, 4605755272910869620U,
+ 4596846128749438754U, 4606975506703684317U,
+ 13830347543558460125U, 4596846128749438754U,
+ 4606558823023444576U, 4600257918160607478U,
+ 13823629955015383286U, 4606558823023444576U,
+ 4600870609507958271U, 4606431930490633905U,
+ 13829803967345409713U, 4600870609507958271U,
+ 4604660425598397818U, 4604425958770613225U,
+ 13827797995625389033U, 4604660425598397818U,
+ 4580962600092897021U, 4607180892816495009U,
+ 13830552929671270817U, 4580962600092897021U,
+ 4607180892816495009U, 4580962600092897021U,
+ 13804334636947672829U, 4607180892816495009U,
+ 4604425958770613225U, 4604660425598397818U,
+ 13828032462453173626U, 4604425958770613225U,
+ 4606431930490633905U, 4600870609507958271U,
+ 13824242646362734079U, 4606431930490633905U,
+ 4600257918160607478U, 4606558823023444576U,
+ 13829930859878220384U, 4600257918160607478U,
+ 4606975506703684317U, 4596846128749438754U,
+ 13820218165604214562U, 4606975506703684317U,
+ 4603040651631881451U, 4605755272910869620U,
+ 13829127309765645428U, 4603040651631881451U,
+ 4605571053506370248U, 4603316355454250015U,
+ 13826688392309025823U, 4605571053506370248U,
+ 4595545269419264690U, 4607040195955932526U,
+ 13830412232810708334U, 4595545269419264690U,
+ 4607121277474223905U, 4593046061348462537U,
+ 13816418098203238345U, 4607121277474223905U,
+ 4603760198400967492U, 4605241877142478242U,
+ 13828613913997254050U, 4603760198400967492U,
+ 4606039359984203741U, 4602454542796181607U,
+ 13825826579650957415U, 4606039359984203741U,
+ 4598582729657176439U, 4606841238740778884U,
+ 13830213275595554692U, 4598582729657176439U,
+ 4606744984357082948U, 4599217346014614711U,
+ 13822589382869390519U, 4606744984357082948U,
+ 4601869677011524443U, 4606195668621671667U,
+ 13829567705476447475U, 4601869677011524443U,
+ 4605031521104517324U, 4604016517974851588U,
+ 13827388554829627396U, 4605031521104517324U,
+ 4590406145430462614U, 4607153778602162496U,
+ 13830525815456938304U, 4590406145430462614U,
+ 4607161910007591876U, 4589524267239410099U,
+ 13812896304094185907U, 4607161910007591876U,
+ 4604100215502905499U, 4604959323120302796U,
+ 13828331359975078604U, 4604100215502905499U,
+ 4606245366082353408U, 4601672213217083403U,
+ 13825044250071859211U, 4606245366082353408U,
+ 4599427256825614420U, 4606710311774494716U,
+ 13830082348629270524U, 4599427256825614420U,
+ 4606870719641066940U, 4598369669086960528U,
+ 13821741705941736336U, 4606870719641066940U,
+ 4602646891659203088U, 4605984877841711338U,
+ 13829356914696487146U, 4602646891659203088U,
+ 4605309881318010327U, 4603673059103075106U,
+ 13827045095957850914U, 4605309881318010327U,
+ 4593797652641645341U, 4607107746899444102U,
+ 13830479783754219910U, 4593797652641645341U,
+ 4607059093103722971U, 4595109641634432498U,
+ 13818481678489208306U, 4607059093103722971U,
+ 4603406726595779752U, 4605507406967535927U,
+ 13828879443822311735U, 4603406726595779752U,
+ 4605814408482919348U, 4602947266358709886U,
+ 13826319303213485694U, 4605814408482919348U,
+ 4597277522845151878U, 4606951288507767453U,
+ 13830323325362543261U, 4597277522845151878U,
+ 4606598603759044570U, 4600051662802353687U,
+ 13823423699657129495U, 4606598603759044570U,
+ 4601072712526242277U, 4606387137437298591U,
+ 13829759174292074399U, 4601072712526242277U,
+ 4604736643460027021U, 4604345904647073908U,
+ 13827717941501849716U, 4604736643460027021U,
+ 4584498631466405633U, 4607178180169683960U,
+ 13830550217024459768U, 4584498631466405633U,
+ 4607174111710118367U, 4586348876009622851U,
+ 13809720912864398659U, 4607174111710118367U,
+ 4604264921241055824U, 4604811873195349477U,
+ 13828183910050125285U, 4604264921241055824U,
+ 4606341107699334546U, 4601273700967202825U,
+ 13824645737821978633U, 4606341107699334546U,
+ 4599844446633109139U, 4606637115963965612U,
+ 13830009152818741420U, 4599844446633109139U,
+ 4606925748668145757U, 4597707695679609371U,
+ 13821079732534385179U, 4606925748668145757U,
+ 4602853162432841185U, 4605872393621214213U,
+ 13829244430475990021U, 4602853162432841185U,
+ 4605442656228245717U, 4603496309891590679U,
+ 13826868346746366487U, 4605442656228245717U,
+ 4594673119063280916U, 4607076652372832968U,
+ 13830448689227608776U, 4594673119063280916U,
+ 4607092871118901179U, 4594235767444503503U,
+ 13817607804299279311U, 4607092871118901179U,
+ 4603585091850767959U, 4605376811039722786U,
+ 13828748847894498594U, 4603585091850767959U,
+ 4605929219593405673U, 4602758354025980442U,
+ 13826130390880756250U, 4605929219593405673U,
+ 4598136582470364665U, 4606898891031025132U,
+ 13830270927885800940U, 4598136582470364665U,
+ 4606674353838411301U, 4599636300858866724U,
+ 13823008337713642532U, 4606674353838411301U,
+ 4601473544562720001U, 4606293848208650998U,
+ 13829665885063426806U, 4601473544562720001U,
+ 4604886103475043762U, 4604183020748362039U,
+ 13827555057603137847U, 4604886103475043762U,
+ 4588115294056142819U, 4607168688050493276U,
+ 13830540724905269084U, 4588115294056142819U,
+ 4607144295058764886U, 4591287158938884897U,
+ 13814659195793660705U, 4607144295058764886U,
+ 4603931940768740167U, 4605102686554936490U,
+ 13828474723409712298U, 4603931940768740167U,
+ 4606144763310860551U, 4602065906208722008U,
+ 13825437943063497816U, 4606144763310860551U,
+ 4599006600037663623U, 4606778366364612594U,
+ 13830150403219388402U, 4599006600037663623U,
+ 4606810452769876110U, 4598795050632330097U,
+ 13822167087487105905U, 4606810452769876110U,
+ 4602260871257280788U, 4606092657816072624U,
+ 13829464694670848432U, 4602260871257280788U,
+ 4605172808754305228U, 4603846496621587377U,
+ 13827218533476363185U, 4605172808754305228U,
+ 4592167175087283203U, 4607133460805585796U,
+ 13830505497660361604U, 4592167175087283203U,
+ 4607019963775302583U, 4595979936813835462U,
+ 13819351973668611270U, 4607019963775302583U,
+ 4603225210076562971U, 4605633586259814045U,
+ 13829005623114589853U, 4603225210076562971U,
+ 4605694995810664660U, 4603133304188877240U,
+ 13826505341043653048U, 4605694995810664660U,
+ 4596413578358834022U, 4606998399608725124U,
+ 13830370436463500932U, 4596413578358834022U,
+ 4606517779747998088U, 4600463181646572228U,
+ 13823835218501348036U, 4606517779747998088U,
+ 4600667422348321968U, 4606475480113671417U,
+ 13829847516968447225U, 4600667422348321968U,
+ 4604583231088591477U, 4604505071555817232U,
+ 13827877108410593040U, 4604583231088591477U,
+ 4573724215515480177U, 4607182249242036882U,
+ 13830554286096812690U, 4573724215515480177U,
+ 4607182376410422530U, 4569220649180767418U,
+ 13792592686035543226U, 4607182376410422530U,
+ 4604524701268679793U, 4604563781218984604U,
+ 13827935818073760412U, 4604524701268679793U,
+ 4606486172460753999U, 4600616459743653188U,
+ 13823988496598428996U, 4606486172460753999U,
+ 4600514338912178239U, 4606507322377452870U,
+ 13829879359232228678U, 4600514338912178239U,
+ 4607003915349878877U, 4596305267720071930U,
+ 13819677304574847738U, 4607003915349878877U,
+ 4603156351203636159U, 4605679749231851918U,
+ 13829051786086627726U, 4603156351203636159U,
+ 4605649044311923410U, 4603202304363743346U,
+ 13826574341218519154U, 4605649044311923410U,
+ 4596088445927168004U, 4607014697483910382U,
+ 13830386734338686190U, 4596088445927168004U,
+ 4607136295912168606U, 4591947271803021404U,
+ 13815319308657797212U, 4607136295912168606U,
+ 4603867938232615808U, 4605155376589456981U,
+ 13828527413444232789U, 4603867938232615808U,
+ 4606105796280968177U, 4602212250118051877U,
+ 13825584286972827685U, 4606105796280968177U,
+ 4598848011564831930U, 4606802552898869248U,
+ 13830174589753645056U, 4598848011564831930U,
+ 4606786509620734768U, 4598953786765296928U,
+ 13822325823620072736U, 4606786509620734768U,
+ 4602114767134999006U, 4606131849150971908U,
+ 13829503886005747716U, 4602114767134999006U,
+ 4605120315324767624U, 4603910660507251362U,
+ 13827282697362027170U, 4605120315324767624U,
+ 4591507261658050721U, 4607141713064252300U,
+ 13830513749919028108U, 4591507261658050721U,
+ 4607170170974224083U, 4587673791460508439U,
+ 13811045828315284247U, 4607170170974224083U,
+ 4604203581176243359U, 4604867640218014515U,
+ 13828239677072790323U, 4604203581176243359U,
+ 4606305777984577632U, 4601423692641949331U,
+ 13824795729496725139U, 4606305777984577632U,
+ 4599688422741010356U, 4606665164148251002U,
+ 13830037201003026810U, 4599688422741010356U,
+ 4606905728766014348U, 4598029484874872834U,
+ 13821401521729648642U, 4606905728766014348U,
+ 4602782121393764535U, 4605915122243179241U,
+ 13829287159097955049U, 4602782121393764535U,
+ 4605393374401988274U, 4603562972219549215U,
+ 13826935009074325023U, 4605393374401988274U,
+ 4594345179472540681U, 4607088942243446236U,
+ 13830460979098222044U, 4594345179472540681U,
+ 4607080832832247697U, 4594563856311064231U,
+ 13817935893165840039U, 4607080832832247697U,
+ 4603518581031047189U, 4605426297151190466U,
+ 13828798334005966274U, 4603518581031047189U,
+ 4605886709123365959U, 4602829525820289164U,
+ 13826201562675064972U, 4605886709123365959U,
+ 4597815040470278984U, 4606919157647773535U,
+ 13830291194502549343U, 4597815040470278984U,
+ 4606646545123403481U, 4599792496117920694U,
+ 13823164532972696502U, 4606646545123403481U,
+ 4601323770373937522U, 4606329407841126011U,
+ 13829701444695901819U, 4601323770373937522U,
+ 4604830524903495634U, 4604244531615310815U,
+ 13827616568470086623U, 4604830524903495634U,
+ 4586790578280679046U, 4607172882816799076U,
+ 13830544919671574884U, 4586790578280679046U,
+ 4607178985458280057U, 4583614727651146525U,
+ 13806986764505922333U, 4607178985458280057U,
+ 4604366005771528720U, 4604717681185626434U,
+ 13828089718040402242U, 4604366005771528720U,
+ 4606398451906509788U, 4601022290077223616U,
+ 13824394326931999424U, 4606398451906509788U,
+ 4600103317933788342U, 4606588777269136769U,
+ 13829960814123912577U, 4600103317933788342U,
+ 4606957467106717424U, 4597169786279785693U,
+ 13820541823134561501U, 4606957467106717424U,
+ 4602970680601913687U, 4605799732098147061U,
+ 13829171768952922869U, 4602970680601913687U,
+ 4605523422498301790U, 4603384207141321914U,
+ 13826756243996097722U, 4605523422498301790U,
+ 4595218635031890910U, 4607054494135176056U,
+ 13830426530989951864U, 4595218635031890910U,
+ 4607111255739239816U, 4593688012422887515U,
+ 13817060049277663323U, 4607111255739239816U,
+ 4603694922063032361U, 4605292980606880364U,
+ 13828665017461656172U, 4603694922063032361U,
+ 4605998608960791335U, 4602598930031891166U,
+ 13825970966886666974U, 4605998608960791335U,
+ 4598423001813699022U, 4606863472012527185U,
+ 13830235508867302993U, 4598423001813699022U,
+ 4606719100629313491U, 4599374859150636784U,
+ 13822746896005412592U, 4606719100629313491U,
+ 4601721693286060937U, 4606233055365547081U,
+ 13829605092220322889U, 4601721693286060937U,
+ 4604977468824438271U, 4604079374282302598U,
+ 13827451411137078406U, 4604977468824438271U,
+ 4589744810590291021U, 4607160003989618959U,
+ 13830532040844394767U, 4589744810590291021U,
+ 4607155938267770208U, 4590185751760970393U,
+ 13813557788615746201U, 4607155938267770208U,
+ 4604037525321326463U, 4605013567986435066U,
+ 13828385604841210874U, 4604037525321326463U,
+ 4606208206518262803U, 4601820425647934753U,
+ 13825192462502710561U, 4606208206518262803U,
+ 4599269903251194481U, 4606736437002195879U,
+ 13830108473856971687U, 4599269903251194481U,
+ 4606848731493011465U, 4598529532600161144U,
+ 13821901569454936952U, 4606848731493011465U,
+ 4602502755147763107U, 4606025850160239809U,
+ 13829397887015015617U, 4602502755147763107U,
+ 4605258978359093269U, 4603738491917026584U,
+ 13827110528771802392U, 4605258978359093269U,
+ 4593265590854265407U, 4607118021058468598U,
+ 13830490057913244406U, 4593265590854265407U,
+ 4607045045516813836U, 4595436449949385485U,
+ 13818808486804161293U, 4607045045516813836U,
+ 4603339021357904144U, 4605555245917486022U,
+ 13828927282772261830U, 4603339021357904144U,
+ 4605770164172969910U, 4603017373458244943U,
+ 13826389410313020751U, 4605770164172969910U,
+ 4596954088216812973U, 4606969576261663845U,
+ 13830341613116439653U, 4596954088216812973U,
+ 4606568886807728474U, 4600206446098256018U,
+ 13823578482953031826U, 4606568886807728474U,
+ 4600921238092511730U, 4606420848538580260U,
+ 13829792885393356068U, 4600921238092511730U,
+ 4604679572075463103U, 4604406033021674239U,
+ 13827778069876450047U, 4604679572075463103U,
+ 4581846703643734566U, 4607180341788068727U,
+ 13830552378642844535U, 4581846703643734566U,
+ 4607181359080094673U, 4579996072175835083U,
+ 13803368109030610891U, 4607181359080094673U,
+ 4604445825685214043U, 4604641218080103285U,
+ 13828013254934879093U, 4604445825685214043U,
+ 4606442934727379583U, 4600819913163773071U,
+ 13824191950018548879U, 4606442934727379583U,
+ 4600309328230211502U, 4606548680329491866U,
+ 13829920717184267674U, 4600309328230211502U,
+ 4606981354314050484U, 4596738097012783531U,
+ 13820110133867559339U, 4606981354314050484U,
+ 4603063884010218172U, 4605740310302420207U,
+ 13829112347157196015U, 4603063884010218172U,
+ 4605586791482848547U, 4603293641160266722U,
+ 13826665678015042530U, 4605586791482848547U,
+ 4595654028864046335U, 4607035262954517034U,
+ 13830407299809292842U, 4595654028864046335U,
+ 4607124449686274900U, 4592826452951465409U,
+ 13816198489806241217U, 4607124449686274900U,
+ 4603781852316960384U, 4605224709411790590U,
+ 13828596746266566398U, 4603781852316960384U,
+ 4606052795787882823U, 4602406247776385022U,
+ 13825778284631160830U, 4606052795787882823U,
+ 4598635880488956483U, 4606833664420673202U,
+ 13830205701275449010U, 4598635880488956483U,
+ 4606753451050079834U, 4599164736579548843U,
+ 13822536773434324651U, 4606753451050079834U,
+ 4601918851211878557U, 4606183055233559255U,
+ 13829555092088335063U, 4601918851211878557U,
+ 4605049409688478101U, 4603995455647851249U,
+ 13827367492502627057U, 4605049409688478101U,
+ 4590626485056654602U, 4607151534426937478U,
+ 13830523571281713286U, 4590626485056654602U,
+ 4607163731439411601U, 4589303678145802340U,
+ 13812675715000578148U, 4607163731439411601U,
+ 4604121000955189926U, 4604941113561600762U,
+ 13828313150416376570U, 4604121000955189926U,
+ 4606257600839867033U, 4601622657843474729U,
+ 13824994694698250537U, 4606257600839867033U,
+ 4599479600326345459U, 4606701442584137310U,
+ 13830073479438913118U, 4599479600326345459U,
+ 4606877885424248132U, 4598316292140394014U,
+ 13821688328995169822U, 4606877885424248132U,
+ 4602686793990243041U, 4605971073215153165U,
+ 13829343110069928973U, 4602686793990243041U,
+ 4605326714874986465U, 4603651144395358093U,
+ 13827023181250133901U, 4605326714874986465U,
+ 4593907249284540294U, 4607104153983298999U,
+ 13830476190838074807U, 4593907249284540294U,
+ 4607063608453868552U, 4595000592312171144U,
+ 13818372629166946952U, 4607063608453868552U,
+ 4603429196809300824U, 4605491322423429598U,
+ 13828863359278205406U, 4603429196809300824U,
+ 4605829012964735987U, 4602923807199184054U,
+ 13826295844053959862U, 4605829012964735987U,
+ 4597385183080791534U, 4606945027305114062U,
+ 13830317064159889870U, 4597385183080791534U,
+ 4606608350964852124U, 4599999947619525579U,
+ 13823371984474301387U, 4606608350964852124U,
+ 4601123065313358619U, 4606375745674388705U,
+ 13829747782529164513U, 4601123065313358619U,
+ 4604755543975806820U, 4604325745441780828U,
+ 13827697782296556636U, 4604755543975806820U,
+ 4585023436363055487U, 4607177290141793710U,
+ 13830549326996569518U, 4585023436363055487U,
+ 4607175255902437396U, 4585907115494236537U,
+ 13809279152349012345U, 4607175255902437396U,
+ 4604285253548209224U, 4604793159020491611U,
+ 13828165195875267419U, 4604285253548209224U,
+ 4606352730697093817U, 4601223560006786057U,
+ 13824595596861561865U, 4606352730697093817U,
+ 4599896339047301634U, 4606627607157935956U,
+ 13829999644012711764U, 4599896339047301634U,
+ 4606932257325205256U, 4597600270510262682U,
+ 13820972307365038490U, 4606932257325205256U,
+ 4602876755014813164U, 4605858005670328613U,
+ 13829230042525104421U, 4602876755014813164U,
+ 4605458946901419122U, 4603473988668005304U,
+ 13826846025522781112U, 4605458946901419122U,
+ 4594782329999411347U, 4607072388129742377U,
+ 13830444424984518185U, 4594782329999411347U,
+ 4607096716058023245U, 4594126307716900071U,
+ 13817498344571675879U, 4607096716058023245U,
+ 4603607160562208225U, 4605360179893335444U,
+ 13828732216748111252U, 4603607160562208225U,
+ 4605943243960030558U, 4602734543519989142U,
+ 13826106580374764950U, 4605943243960030558U,
+ 4598209407597805010U, 4606891971185517504U,
+ 13830264008040293312U, 4598209407597805010U,
+ 4606683463531482757U, 4599584122834874440U,
+ 13822956159689650248U, 4606683463531482757U,
+ 4601523323048804569U, 4606281842017099424U,
+ 13829653878871875232U, 4601523323048804569U,
+ 4604904503566677638U, 4604162403772767740U,
+ 13827534440627543548U, 4604904503566677638U,
+ 4588556721781247689U, 4607167120476811757U,
+ 13830539157331587565U, 4588556721781247689U,
+ 4607146792632922887U, 4591066993883984169U,
+ 13814439030738759977U, 4607146792632922887U,
+ 4603953166845776383U, 4605084992581147553U,
+ 13828457029435923361U, 4603953166845776383U,
+ 4606157602458368090U, 4602016966272225497U,
+ 13825389003127001305U, 4606157602458368090U,
+ 4599059363095165615U, 4606770142132396069U,
+ 13830142178987171877U, 4599059363095165615U,
+ 4606818271362779153U, 4598742041476147134U,
+ 13822114078330922942U, 4606818271362779153U,
+ 4602309411551204896U, 4606079444829232727U,
+ 13829451481684008535U, 4602309411551204896U,
+ 4605190175055178825U, 4603825001630339212U,
+ 13827197038485115020U, 4605190175055178825U,
+ 4592387007752762956U, 4607130541380624519U,
+ 13830502578235400327U, 4592387007752762956U,
+ 4607025146816593591U, 4595871363584150300U,
+ 13819243400438926108U, 4607025146816593591U,
+ 4603248068256948438U, 4605618058006716661U,
+ 13828990094861492469U, 4603248068256948438U,
+ 4605710171610479304U, 4603110210506737381U,
+ 13826482247361513189U, 4605710171610479304U,
+ 4596521820799644122U, 4606992800820440327U,
+ 13830364837675216135U, 4596521820799644122U,
+ 4606528158595189433U, 4600411960456200676U,
+ 13823783997310976484U, 4606528158595189433U,
+ 4600718319105833937U, 4606464709641375231U,
+ 13829836746496151039U, 4600718319105833937U,
+ 4604602620643553229U, 4604485382263976838U,
+ 13827857419118752646U, 4604602620643553229U,
+ 4576459225186735875U, 4607182037296057423U,
+ 13830554074150833231U, 4576459225186735875U,
+ 4607182037296057423U, 4576459225186735875U,
+ 13799831262041511683U, 4607182037296057423U,
+ 4604485382263976838U, 4604602620643553229U,
+ 13827974657498329037U, 4604485382263976838U,
+ 4606464709641375231U, 4600718319105833937U,
+ 13824090355960609745U, 4606464709641375231U,
+ 4600411960456200676U, 4606528158595189433U,
+ 13829900195449965241U, 4600411960456200676U,
+ 4606992800820440327U, 4596521820799644122U,
+ 13819893857654419930U, 4606992800820440327U,
+ 4603110210506737381U, 4605710171610479304U,
+ 13829082208465255112U, 4603110210506737381U,
+ 4605618058006716661U, 4603248068256948438U,
+ 13826620105111724246U, 4605618058006716661U,
+ 4595871363584150300U, 4607025146816593591U,
+ 13830397183671369399U, 4595871363584150300U,
+ 4607130541380624519U, 4592387007752762956U,
+ 13815759044607538764U, 4607130541380624519U,
+ 4603825001630339212U, 4605190175055178825U,
+ 13828562211909954633U, 4603825001630339212U,
+ 4606079444829232727U, 4602309411551204896U,
+ 13825681448405980704U, 4606079444829232727U,
+ 4598742041476147134U, 4606818271362779153U,
+ 13830190308217554961U, 4598742041476147134U,
+ 4606770142132396069U, 4599059363095165615U,
+ 13822431399949941423U, 4606770142132396069U,
+ 4602016966272225497U, 4606157602458368090U,
+ 13829529639313143898U, 4602016966272225497U,
+ 4605084992581147553U, 4603953166845776383U,
+ 13827325203700552191U, 4605084992581147553U,
+ 4591066993883984169U, 4607146792632922887U,
+ 13830518829487698695U, 4591066993883984169U,
+ 4607167120476811757U, 4588556721781247689U,
+ 13811928758636023497U, 4607167120476811757U,
+ 4604162403772767740U, 4604904503566677638U,
+ 13828276540421453446U, 4604162403772767740U,
+ 4606281842017099424U, 4601523323048804569U,
+ 13824895359903580377U, 4606281842017099424U,
+ 4599584122834874440U, 4606683463531482757U,
+ 13830055500386258565U, 4599584122834874440U,
+ 4606891971185517504U, 4598209407597805010U,
+ 13821581444452580818U, 4606891971185517504U,
+ 4602734543519989142U, 4605943243960030558U,
+ 13829315280814806366U, 4602734543519989142U,
+ 4605360179893335444U, 4603607160562208225U,
+ 13826979197416984033U, 4605360179893335444U,
+ 4594126307716900071U, 4607096716058023245U,
+ 13830468752912799053U, 4594126307716900071U,
+ 4607072388129742377U, 4594782329999411347U,
+ 13818154366854187155U, 4607072388129742377U,
+ 4603473988668005304U, 4605458946901419122U,
+ 13828830983756194930U, 4603473988668005304U,
+ 4605858005670328613U, 4602876755014813164U,
+ 13826248791869588972U, 4605858005670328613U,
+ 4597600270510262682U, 4606932257325205256U,
+ 13830304294179981064U, 4597600270510262682U,
+ 4606627607157935956U, 4599896339047301634U,
+ 13823268375902077442U, 4606627607157935956U,
+ 4601223560006786057U, 4606352730697093817U,
+ 13829724767551869625U, 4601223560006786057U,
+ 4604793159020491611U, 4604285253548209224U,
+ 13827657290402985032U, 4604793159020491611U,
+ 4585907115494236537U, 4607175255902437396U,
+ 13830547292757213204U, 4585907115494236537U,
+ 4607177290141793710U, 4585023436363055487U,
+ 13808395473217831295U, 4607177290141793710U,
+ 4604325745441780828U, 4604755543975806820U,
+ 13828127580830582628U, 4604325745441780828U,
+ 4606375745674388705U, 4601123065313358619U,
+ 13824495102168134427U, 4606375745674388705U,
+ 4599999947619525579U, 4606608350964852124U,
+ 13829980387819627932U, 4599999947619525579U,
+ 4606945027305114062U, 4597385183080791534U,
+ 13820757219935567342U, 4606945027305114062U,
+ 4602923807199184054U, 4605829012964735987U,
+ 13829201049819511795U, 4602923807199184054U,
+ 4605491322423429598U, 4603429196809300824U,
+ 13826801233664076632U, 4605491322423429598U,
+ 4595000592312171144U, 4607063608453868552U,
+ 13830435645308644360U, 4595000592312171144U,
+ 4607104153983298999U, 4593907249284540294U,
+ 13817279286139316102U, 4607104153983298999U,
+ 4603651144395358093U, 4605326714874986465U,
+ 13828698751729762273U, 4603651144395358093U,
+ 4605971073215153165U, 4602686793990243041U,
+ 13826058830845018849U, 4605971073215153165U,
+ 4598316292140394014U, 4606877885424248132U,
+ 13830249922279023940U, 4598316292140394014U,
+ 4606701442584137310U, 4599479600326345459U,
+ 13822851637181121267U, 4606701442584137310U,
+ 4601622657843474729U, 4606257600839867033U,
+ 13829629637694642841U, 4601622657843474729U,
+ 4604941113561600762U, 4604121000955189926U,
+ 13827493037809965734U, 4604941113561600762U,
+ 4589303678145802340U, 4607163731439411601U,
+ 13830535768294187409U, 4589303678145802340U,
+ 4607151534426937478U, 4590626485056654602U,
+ 13813998521911430410U, 4607151534426937478U,
+ 4603995455647851249U, 4605049409688478101U,
+ 13828421446543253909U, 4603995455647851249U,
+ 4606183055233559255U, 4601918851211878557U,
+ 13825290888066654365U, 4606183055233559255U,
+ 4599164736579548843U, 4606753451050079834U,
+ 13830125487904855642U, 4599164736579548843U,
+ 4606833664420673202U, 4598635880488956483U,
+ 13822007917343732291U, 4606833664420673202U,
+ 4602406247776385022U, 4606052795787882823U,
+ 13829424832642658631U, 4602406247776385022U,
+ 4605224709411790590U, 4603781852316960384U,
+ 13827153889171736192U, 4605224709411790590U,
+ 4592826452951465409U, 4607124449686274900U,
+ 13830496486541050708U, 4592826452951465409U,
+ 4607035262954517034U, 4595654028864046335U,
+ 13819026065718822143U, 4607035262954517034U,
+ 4603293641160266722U, 4605586791482848547U,
+ 13828958828337624355U, 4603293641160266722U,
+ 4605740310302420207U, 4603063884010218172U,
+ 13826435920864993980U, 4605740310302420207U,
+ 4596738097012783531U, 4606981354314050484U,
+ 13830353391168826292U, 4596738097012783531U,
+ 4606548680329491866U, 4600309328230211502U,
+ 13823681365084987310U, 4606548680329491866U,
+ 4600819913163773071U, 4606442934727379583U,
+ 13829814971582155391U, 4600819913163773071U,
+ 4604641218080103285U, 4604445825685214043U,
+ 13827817862539989851U, 4604641218080103285U,
+ 4579996072175835083U, 4607181359080094673U,
+ 13830553395934870481U, 4579996072175835083U,
+ 4607180341788068727U, 4581846703643734566U,
+ 13805218740498510374U, 4607180341788068727U,
+ 4604406033021674239U, 4604679572075463103U,
+ 13828051608930238911U, 4604406033021674239U,
+ 4606420848538580260U, 4600921238092511730U,
+ 13824293274947287538U, 4606420848538580260U,
+ 4600206446098256018U, 4606568886807728474U,
+ 13829940923662504282U, 4600206446098256018U,
+ 4606969576261663845U, 4596954088216812973U,
+ 13820326125071588781U, 4606969576261663845U,
+ 4603017373458244943U, 4605770164172969910U,
+ 13829142201027745718U, 4603017373458244943U,
+ 4605555245917486022U, 4603339021357904144U,
+ 13826711058212679952U, 4605555245917486022U,
+ 4595436449949385485U, 4607045045516813836U,
+ 13830417082371589644U, 4595436449949385485U,
+ 4607118021058468598U, 4593265590854265407U,
+ 13816637627709041215U, 4607118021058468598U,
+ 4603738491917026584U, 4605258978359093269U,
+ 13828631015213869077U, 4603738491917026584U,
+ 4606025850160239809U, 4602502755147763107U,
+ 13825874792002538915U, 4606025850160239809U,
+ 4598529532600161144U, 4606848731493011465U,
+ 13830220768347787273U, 4598529532600161144U,
+ 4606736437002195879U, 4599269903251194481U,
+ 13822641940105970289U, 4606736437002195879U,
+ 4601820425647934753U, 4606208206518262803U,
+ 13829580243373038611U, 4601820425647934753U,
+ 4605013567986435066U, 4604037525321326463U,
+ 13827409562176102271U, 4605013567986435066U,
+ 4590185751760970393U, 4607155938267770208U,
+ 13830527975122546016U, 4590185751760970393U,
+ 4607160003989618959U, 4589744810590291021U,
+ 13813116847445066829U, 4607160003989618959U,
+ 4604079374282302598U, 4604977468824438271U,
+ 13828349505679214079U, 4604079374282302598U,
+ 4606233055365547081U, 4601721693286060937U,
+ 13825093730140836745U, 4606233055365547081U,
+ 4599374859150636784U, 4606719100629313491U,
+ 13830091137484089299U, 4599374859150636784U,
+ 4606863472012527185U, 4598423001813699022U,
+ 13821795038668474830U, 4606863472012527185U,
+ 4602598930031891166U, 4605998608960791335U,
+ 13829370645815567143U, 4602598930031891166U,
+ 4605292980606880364U, 4603694922063032361U,
+ 13827066958917808169U, 4605292980606880364U,
+ 4593688012422887515U, 4607111255739239816U,
+ 13830483292594015624U, 4593688012422887515U,
+ 4607054494135176056U, 4595218635031890910U,
+ 13818590671886666718U, 4607054494135176056U,
+ 4603384207141321914U, 4605523422498301790U,
+ 13828895459353077598U, 4603384207141321914U,
+ 4605799732098147061U, 4602970680601913687U,
+ 13826342717456689495U, 4605799732098147061U,
+ 4597169786279785693U, 4606957467106717424U,
+ 13830329503961493232U, 4597169786279785693U,
+ 4606588777269136769U, 4600103317933788342U,
+ 13823475354788564150U, 4606588777269136769U,
+ 4601022290077223616U, 4606398451906509788U,
+ 13829770488761285596U, 4601022290077223616U,
+ 4604717681185626434U, 4604366005771528720U,
+ 13827738042626304528U, 4604717681185626434U,
+ 4583614727651146525U, 4607178985458280057U,
+ 13830551022313055865U, 4583614727651146525U,
+ 4607172882816799076U, 4586790578280679046U,
+ 13810162615135454854U, 4607172882816799076U,
+ 4604244531615310815U, 4604830524903495634U,
+ 13828202561758271442U, 4604244531615310815U,
+ 4606329407841126011U, 4601323770373937522U,
+ 13824695807228713330U, 4606329407841126011U,
+ 4599792496117920694U, 4606646545123403481U,
+ 13830018581978179289U, 4599792496117920694U,
+ 4606919157647773535U, 4597815040470278984U,
+ 13821187077325054792U, 4606919157647773535U,
+ 4602829525820289164U, 4605886709123365959U,
+ 13829258745978141767U, 4602829525820289164U,
+ 4605426297151190466U, 4603518581031047189U,
+ 13826890617885822997U, 4605426297151190466U,
+ 4594563856311064231U, 4607080832832247697U,
+ 13830452869687023505U, 4594563856311064231U,
+ 4607088942243446236U, 4594345179472540681U,
+ 13817717216327316489U, 4607088942243446236U,
+ 4603562972219549215U, 4605393374401988274U,
+ 13828765411256764082U, 4603562972219549215U,
+ 4605915122243179241U, 4602782121393764535U,
+ 13826154158248540343U, 4605915122243179241U,
+ 4598029484874872834U, 4606905728766014348U,
+ 13830277765620790156U, 4598029484874872834U,
+ 4606665164148251002U, 4599688422741010356U,
+ 13823060459595786164U, 4606665164148251002U,
+ 4601423692641949331U, 4606305777984577632U,
+ 13829677814839353440U, 4601423692641949331U,
+ 4604867640218014515U, 4604203581176243359U,
+ 13827575618031019167U, 4604867640218014515U,
+ 4587673791460508439U, 4607170170974224083U,
+ 13830542207828999891U, 4587673791460508439U,
+ 4607141713064252300U, 4591507261658050721U,
+ 13814879298512826529U, 4607141713064252300U,
+ 4603910660507251362U, 4605120315324767624U,
+ 13828492352179543432U, 4603910660507251362U,
+ 4606131849150971908U, 4602114767134999006U,
+ 13825486803989774814U, 4606131849150971908U,
+ 4598953786765296928U, 4606786509620734768U,
+ 13830158546475510576U, 4598953786765296928U,
+ 4606802552898869248U, 4598848011564831930U,
+ 13822220048419607738U, 4606802552898869248U,
+ 4602212250118051877U, 4606105796280968177U,
+ 13829477833135743985U, 4602212250118051877U,
+ 4605155376589456981U, 4603867938232615808U,
+ 13827239975087391616U, 4605155376589456981U,
+ 4591947271803021404U, 4607136295912168606U,
+ 13830508332766944414U, 4591947271803021404U,
+ 4607014697483910382U, 4596088445927168004U,
+ 13819460482781943812U, 4607014697483910382U,
+ 4603202304363743346U, 4605649044311923410U,
+ 13829021081166699218U, 4603202304363743346U,
+ 4605679749231851918U, 4603156351203636159U,
+ 13826528388058411967U, 4605679749231851918U,
+ 4596305267720071930U, 4607003915349878877U,
+ 13830375952204654685U, 4596305267720071930U,
+ 4606507322377452870U, 4600514338912178239U,
+ 13823886375766954047U, 4606507322377452870U,
+ 4600616459743653188U, 4606486172460753999U,
+ 13829858209315529807U, 4600616459743653188U,
+ 4604563781218984604U, 4604524701268679793U,
+ 13827896738123455601U, 4604563781218984604U,
+ 4569220649180767418U, 4607182376410422530U,
+ 13830554413265198338U, 4569220649180767418U
+};
+
+const fpr fpr_p2_tab[] = {
+ 4611686018427387904U,
+ 4607182418800017408U,
+ 4602678819172646912U,
+ 4598175219545276416U,
+ 4593671619917905920U,
+ 4589168020290535424U,
+ 4584664420663164928U,
+ 4580160821035794432U,
+ 4575657221408423936U,
+ 4571153621781053440U,
+ 4566650022153682944U
+};
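+
+/*
+ * Illustrative cross-check of the table above (not used by the code):
+ * each entry is the IEEE-754 binary64 encoding of a power of two, from
+ * 2.0 down to 2^(-9), so consecutive entries differ by exactly one step
+ * of the exponent field, i.e. by 1 << 52 = 4503599627370496 in the
+ * integer encoding:
+ *
+ *   4611686018427387904 = 0x4000000000000000 = 2.0
+ *   4607182418800017408 = 0x3FF0000000000000 = 1.0
+ *   4602678819172646912 = 0x3FE0000000000000 = 0.5
+ */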
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.h b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.h
new file mode 100644
index 000000000..3e80b5068
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/fpr.h
@@ -0,0 +1,491 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+/*
+ * Custom floating-point implementation with integer arithmetics. We
+ * use IEEE-754 "binary64" format, with some simplifications:
+ *
+ * - Top bit is s = 1 for negative, 0 for positive.
+ *
+ * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
+ *
+ * - Mantissa m uses the 52 low bits.
+ *
+ * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
+ * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
+ * less than 1.0), but the top bit (equal to 1 by definition) is omitted
+ * in the encoding.
+ *
+ * In IEEE-754, there are some special values:
+ *
+ * - If e = 2047, then the value is either an infinite (m = 0) or
+ * a NaN (m != 0).
+ *
+ * - If e = 0, then the value is either a zero (m = 0) or a subnormal,
+ * aka "denormalized number" (m != 0).
+ *
+ * Of these, we only need the zeros. The caller is responsible for not
+ * providing operands that would lead to infinites, NaNs or subnormals.
+ * If inputs are such that values go out of range, then indeterminate
+ * values are returned (it would still be deterministic, but no specific
+ * value may be relied upon).
+ *
+ * At the C level, the three parts are stored in a 64-bit unsigned
+ * word.
+ *
+ * One may note that a property of the IEEE-754 format is that order
+ * is preserved for positive values: if two positive floating-point
+ * values x and y are such that x < y, then their respective encodings
+ * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
+ * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
+ * y < 0, and x < y, then i64(x) > i64(y).
+ *
+ * IMPORTANT ASSUMPTIONS:
+ * ======================
+ *
+ * For proper computations, and constant-time behaviour, we assume the
+ * following:
+ *
+ * - 32x32->64 multiplication (unsigned) has an execution time that
+ * is independent of its operands. This is true of most modern
+ * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
+ * and M3 (in the M0 and M0+, this is done in software, so it depends
+ * on that routine), and the PowerPC cores from the G3/G4 lines.
+ * For more info, see: https://www.bearssl.org/ctmul.html
+ *
+ * - Left-shifts and right-shifts of 32-bit values have an execution
+ * time which does not depend on the shifted value nor on the
+ * shift count. An historical exception is the Pentium IV, but most
+ * modern CPU have barrel shifters. Some small microcontrollers
+ * might have varying-time shifts (not the ARM Cortex M*, though).
+ *
+ * - Right-shift of a signed negative value performs a sign extension.
+ * As per the C standard, this operation returns an
+ * implementation-defined result (this is NOT an "undefined
+ * behaviour"). On most/all systems, an arithmetic shift is
+ * performed, because this is what makes most sense.
+ */
+
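+/*
+ * Worked example of the encoding described above: the value 1.0 has
+ * s = 0, biased exponent e = 1023 and mantissa m = 0, so its encoding
+ * is 1023 << 52 = 4607182418800017408 (the fpr_one constant defined
+ * further down); adding 1 << 63, i.e. setting the sign bit, would give
+ * the encoding of -1.0.
+ */
+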
+/*
+ * Normally we should declare the 'fpr' type to be a struct or union
+ * around the internal 64-bit value; however, we want to use the
+ * direct 64-bit integer type to enable a lighter call convention on
+ * ARM platforms. This means that direct (invalid) use of operators
+ * such as '*' or '+' will not be caught by the compiler. We rely on
+ * the "normal" (non-emulated) code to detect such instances.
+ */
+typedef uint64_t fpr;
+
+/*
+ * For computations, we split values into an integral mantissa in the
+ * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
+ * "sticky" (it is set to 1 if any of the bits below it is 1); when
+ * re-encoding, the low two bits are dropped, but may induce an
+ * increment in the value for proper rounding.
+ */
+
+/*
+ * Right-shift a 64-bit unsigned value by a possibly secret shift count.
+ * We assumed that the underlying architecture had a barrel shifter for
+ * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will
+ * typically invoke a software routine that is not necessarily
+ * constant-time; hence the function below.
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ursh(uint64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Right-shift a 64-bit signed value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline int64_t
+fpr_irsh(int64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Left-shift a 64-bit unsigned value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ulsh(uint64_t x, int n) {
+ x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
+ return x << (n & 31);
+}
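+
+/*
+ * Illustration of the pattern shared by the three shift helpers above:
+ * with n = 40, n >> 5 is 1, the mask is all-ones, and the first
+ * statement replaces x with x shifted by the constant 32; the final
+ * shift then uses n & 31 = 8, for a total of 40 bits. With n = 8 the
+ * mask is zero, x is untouched, and only the 8-bit shift is applied.
+ * The variable shift count is thus always in the 0..31 range, so no
+ * data-dependent 64-bit software shift is ever invoked.
+ */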
+
+/*
+ * Expectations:
+ * s = 0 or 1
+ * exponent e is "arbitrary" and unbiased
+ * 2^54 <= m < 2^55
+ * Numerical value is (-1)^s * m * 2^e
+ *
+ * Exponents which are too low lead to value zero. If the exponent is
+ * too large, the returned value is indeterminate.
+ *
+ * If m = 0, then a zero is returned (using the provided sign).
+ * If e < -1076, then a zero is returned (regardless of the value of m).
+ * If e >= -1076 and e != 0, m must be within the expected range
+ * (2^54 to 2^55-1).
+ */
+static inline fpr
+FPR(int s, int e, uint64_t m) {
+ fpr x;
+ uint32_t t;
+ unsigned f;
+
+ /*
+ * If e >= -1076, then the value is "normal"; otherwise, it
+ * should be a subnormal, which we clamp down to zero.
+ */
+ e += 1076;
+ t = (uint32_t)e >> 31;
+ m &= (uint64_t)t - 1;
+
+ /*
+ * If m = 0 then we want a zero; make e = 0 too, but conserve
+ * the sign.
+ */
+ t = (uint32_t)(m >> 54);
+ e &= -(int)t;
+
+ /*
+ * The 52 mantissa bits come from m. Value m has its top bit set
+ * (unless it is a zero); we leave it "as is": the top bit will
+ * increment the exponent by 1, except when m = 0, which is
+ * exactly what we want.
+ */
+ x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
+
+ /*
+ * Rounding: if the low three bits of m are 011, 110 or 111,
+ * then the value should be incremented to get the next
+ * representable value. This implements the usual
+ * round-to-nearest rule (with preference to even values in case
+ * of a tie). Note that the increment may make a carry spill
+ * into the exponent field, which is again exactly what we want
+ * in that case.
+ */
+ f = (unsigned)m & 7U;
+ x += (0xC8U >> f) & 1;
+ return x;
+}
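+
+/*
+ * Note on the rounding step above: 0xC8 is 11001000 in binary, so
+ * (0xC8 >> f) & 1 equals 1 exactly when f (the low three bits of m)
+ * is 3 (011), 6 (110) or 7 (111), which are precisely the cases listed
+ * in the comment where the encoded value must be incremented.
+ */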
+
+#define fpr_scaled PQCLEAN_FALCONPADDED1024_CLEAN_fpr_scaled
+fpr fpr_scaled(int64_t i, int sc);
+
+static inline fpr
+fpr_of(int64_t i) {
+ return fpr_scaled(i, 0);
+}
+
+static const fpr fpr_q = 4667981563525332992;
+static const fpr fpr_inverse_of_q = 4545632735260551042;
+static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
+static const fpr fpr_inv_sigma[] = {
+ 0, /* unused */
+ 4574611497772390042,
+ 4574501679055810265,
+ 4574396282908341804,
+ 4574245855758572086,
+ 4574103865040221165,
+ 4573969550563515544,
+ 4573842244705920822,
+ 4573721358406441454,
+ 4573606369665796042,
+ 4573496814039276259
+};
+static const fpr fpr_sigma_min[] = {
+ 0, /* unused */
+ 4607707126469777035,
+ 4607777455861499430,
+ 4607846828256951418,
+ 4607949175006100261,
+ 4608049571757433526,
+ 4608148125896792003,
+ 4608244935301382692,
+ 4608340089478362016,
+ 4608433670533905013,
+ 4608525754002622308
+};
+static const fpr fpr_log2 = 4604418534313441775;
+static const fpr fpr_inv_log2 = 4609176140021203710;
+static const fpr fpr_bnorm_max = 4670353323383631276;
+static const fpr fpr_zero = 0;
+static const fpr fpr_one = 4607182418800017408;
+static const fpr fpr_two = 4611686018427387904;
+static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
+static const fpr fpr_ptwo31 = 4746794007248502784;
+static const fpr fpr_ptwo31m1 = 4746794007244308480;
+static const fpr fpr_mtwo31m1 = 13970166044099084288U;
+static const fpr fpr_ptwo63m1 = 4890909195324358656;
+static const fpr fpr_mtwo63m1 = 14114281232179134464U;
+static const fpr fpr_ptwo63 = 4890909195324358656;
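+
+/*
+ * Note: fpr_ptwo63m1 and fpr_ptwo63 are intentionally the same bit
+ * pattern: 2^63-1 is not exactly representable in binary64 and rounds
+ * to 2^63, so both constants encode the value 9223372036854775808.0.
+ */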
+
+static inline int64_t
+fpr_rint(fpr x) {
+ uint64_t m, d;
+ int e;
+ uint32_t s, dd, f;
+
+ /*
+ * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
+ * thus extract the mantissa as a 63-bit integer, then right-shift
+ * it as needed.
+ */
+ m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ e = 1085 - ((int)(x >> 52) & 0x7FF);
+
+ /*
+ * If a shift of more than 63 bits is needed, then simply set m
+ * to zero. This also covers the case of an input operand equal
+ * to zero.
+ */
+ m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
+ e &= 63;
+
+ /*
+ * Right-shift m as needed. Shift count is e. Proper rounding
+ * mandates that:
+ * - If the highest dropped bit is zero, then round low.
+ * - If the highest dropped bit is one, and at least one of the
+ * other dropped bits is one, then round up.
+ * - If the highest dropped bit is one, and all other dropped
+ * bits are zero, then round up if the lowest kept bit is 1,
+ * or low otherwise (i.e. ties are broken by "rounding to even").
+ *
+ * We thus first extract a word consisting of all the dropped bits
+ * AND the lowest kept bit; then we shrink it down to three bits,
+ * the lowest being "sticky".
+ */
+ d = fpr_ulsh(m, 63 - e);
+ dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
+ f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
+ m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
+
+ /*
+ * Apply the sign bit.
+ */
+ s = (uint32_t)(x >> 63);
+ return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
+}
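+
+/*
+ * The last two lines of fpr_rint() apply the sign without a branch:
+ * for s = 1, (m ^ -1) + 1 is the two's-complement negation of m, while
+ * for s = 0 the expression reduces to m unchanged. The same
+ * conditional-negation idiom reappears in fpr_floor() and fpr_trunc()
+ * below.
+ */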
+
+static inline int64_t
+fpr_floor(fpr x) {
+ uint64_t t;
+ int64_t xi;
+ int e, cc;
+
+ /*
+ * We extract the integer as a _signed_ 64-bit integer with
+ * a scaling factor. Since we assume that the value fits
+ * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
+ * absolute value to make it in the 2^62..2^63-1 range: we
+ * will only need a right-shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ t = x >> 63;
+ xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
+ & (((uint64_t)1 << 63) - 1));
+ xi = (xi ^ -(int64_t)t) + (int64_t)t;
+ cc = 1085 - e;
+
+ /*
+ * We perform an arithmetic right-shift on the value. This
+ * applies floor() semantics on both positive and negative values
+ * (rounding toward minus infinity).
+ */
+ xi = fpr_irsh(xi, cc & 63);
+
+ /*
+ * If the true shift count was 64 or more, then we should instead
+ * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
+ * case: -0 will be floored to -1, not 0 (whether this is correct
+ * is debatable; in any case, the other functions normalize zero
+ * to +0).
+ *
+ * For an input of zero, the non-shifted xi was incorrect (we used
+ * a top implicit bit of value 1, not 0), but this does not matter
+ * since this operation will clamp it down.
+ */
+ xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
+ return xi;
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ uint64_t t, xu;
+ int e, cc;
+
+ /*
+ * Extract the absolute value. Since we assume that the value
+ * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
+ * the absolute value into the 2^62..2^63-1 range, and then
+ * do a right shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ cc = 1085 - e;
+ xu = fpr_ursh(xu, cc & 63);
+
+ /*
+ * If the exponent is too low (cc > 63), then the shift was wrong
+ * and we must clamp the value to 0. This also covers the case
+ * of an input equal to zero.
+ */
+ xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
+
+ /*
+ * Apply back the sign, if the source value is negative.
+ */
+ t = x >> 63;
+ xu = (xu ^ -t) + t;
+ return *(int64_t *)&xu;
+}
+
+#define fpr_add PQCLEAN_FALCONPADDED1024_CLEAN_fpr_add
+fpr fpr_add(fpr x, fpr y);
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ y ^= (uint64_t)1 << 63;
+ return fpr_add(x, y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ x ^= (uint64_t)1 << 63;
+ return x;
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ /*
+ * To divide a value by 2, we just have to subtract 1 from its
+ * exponent, but we have to take care of zero.
+ */
+ uint32_t t;
+
+ x -= (uint64_t)1 << 52;
+ t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
+ x &= (uint64_t)t - 1;
+ return x;
+}
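+
+/*
+ * Worked example with the constants above: fpr_one is
+ * 4607182418800017408; subtracting 1 << 52 = 4503599627370496 yields
+ * 4602678819172646912, which is exactly fpr_onehalf, as expected from
+ * decrementing the exponent field by one. The masking step only kicks
+ * in when the input was a (positive or negative) zero, in which case
+ * the wrapped-around exponent field reads 0x7FF and the result is
+ * clamped to +0.
+ */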
+
+static inline fpr
+fpr_double(fpr x) {
+ /*
+ * To double a value, we just increment by one the exponent. We
+ * don't care about infinites or NaNs; however, 0 is a
+ * special case.
+ */
+ x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
+ return x;
+}
+
+#define fpr_mul PQCLEAN_FALCONPADDED1024_CLEAN_fpr_mul
+fpr fpr_mul(fpr x, fpr y);
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return fpr_mul(x, x);
+}
+
+#define fpr_div PQCLEAN_FALCONPADDED1024_CLEAN_fpr_div
+fpr fpr_div(fpr x, fpr y);
+
+static inline fpr
+fpr_inv(fpr x) {
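+ /* 4607182418800017408 is the binary64 encoding of 1.0 (fpr_one). */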
+ return fpr_div(4607182418800017408u, x);
+}
+
+#define fpr_sqrt PQCLEAN_FALCONPADDED1024_CLEAN_fpr_sqrt
+fpr fpr_sqrt(fpr x);
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ /*
+ * If both x and y are positive, then a signed comparison yields
+ * the proper result:
+ * - For positive values, the order is preserved.
+ * - The sign bit is at the same place as in integers, so
+ * sign is preserved.
+ * Moreover, we can compute [x < y] as sgn(x-y) and the computation
+ * of x-y will not overflow.
+ *
+ * If the signs differ, then sgn(x) gives the proper result.
+ *
+ * If both x and y are negative, then the order is reversed.
+ * Hence [x < y] = sgn(y-x). We must compute this separately from
+ * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
+ * case x = y properly.
+ */
+ int cc0, cc1;
+ int64_t sx;
+ int64_t sy;
+
+ sx = *(int64_t *)&x;
+ sy = *(int64_t *)&y;
+ sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
+
+ cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
+ cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
+
+ return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
+}
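+
+/*
+ * In the return expression above, (x & y) >> 63 is 1 only when both
+ * operands have their sign bit set; that is exactly the "both
+ * negative" case in which the reversed comparison cc1 must be selected
+ * instead of cc0.
+ */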
+
+/*
+ * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
+ * bits or so.
+ */
+#define fpr_expm_p63 PQCLEAN_FALCONPADDED1024_CLEAN_fpr_expm_p63
+uint64_t fpr_expm_p63(fpr x, fpr ccs);
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED1024_CLEAN_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED1024_CLEAN_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/inner.h b/src/sig/falcon/pqclean_falcon-padded-1024_clean/inner.h
new file mode 100644
index 000000000..c63ee1ddf
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/inner.h
@@ -0,0 +1,820 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED1024_CLEAN_ macro (e.g. PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
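+
+/*
+ * Illustrative call sequence (a sketch of how a caller is expected to
+ * drive the context; the nonce/msg buffer names are placeholders, not
+ * identifiers defined elsewhere in this code):
+ *
+ *    inner_shake256_context sc;
+ *    inner_shake256_init(&sc);
+ *    inner_shake256_inject(&sc, nonce, nonce_len);
+ *    inner_shake256_inject(&sc, msg, msg_len);
+ *    inner_shake256_flip(&sc);
+ *    ... hash_to_point_*() or inner_shake256_extract(&sc, out, out_len) ...
+ *    inner_shake256_ctx_release(&sc);
+ */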
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED1024_CLEAN_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
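+
+/*
+ * Sketch of the two-pass usage pattern described above (buffer and
+ * variable names are illustrative only): a first call with out == NULL
+ * returns the exact number of bytes required, after which the caller
+ * allocates a buffer and encodes for real.
+ *
+ *    size_t len = PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(NULL, 0, s2, logn);
+ *    ... allocate len bytes at buf ...
+ *    if (PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(buf, len, s2, logn) == 0) {
+ *        ... handle error (encoding failed) ...
+ *    }
+ */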
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED1024_CLEAN_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists of
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ *      return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] 1/sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED1024_CLEAN_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. The PRNG draws its initial state from the
+ * provided SHAKE256 context, which must be in "flipped" state.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+ }
+ return v;
+}
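+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * seeding the PRNG from a SHAKE256 context and extracting values. This
+ * assumes the inner_shake256_* wrappers declared earlier in this
+ * header; the name example_prng_demo is hypothetical.
+ */
+#if 0
+static uint64_t
+example_prng_demo(const void *seed, size_t seed_len) {
+	inner_shake256_context sc;
+	prng p;
+
+	inner_shake256_init(&sc);
+	inner_shake256_inject(&sc, seed, seed_len);
+	inner_shake256_flip(&sc);   /* the PRNG expects a flipped context */
+	PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&p, &sc);
+	return prng_get_u64(&p);    /* 64 pseudorandom bits */
+}
+#endif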
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(fpr *f, unsigned logn);
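+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): an
+ * FFT round trip on a small polynomial. For logn = 2 the polynomial
+ * has N = 4 'fpr' coefficients; FFT followed by iFFT recovers them up
+ * to floating-point rounding. The name example_fft_roundtrip is
+ * hypothetical.
+ */
+#if 0
+static void
+example_fft_roundtrip(void) {
+	fpr f[4];
+	size_t u;
+
+	for (u = 0; u < 4; u ++) {
+		f[u] = fpr_of((int64_t)u + 1);   /* 1 + 2X + 3X^2 + 4X^3 */
+	}
+	PQCLEAN_FALCONPADDED1024_CLEAN_FFT(f, 2);    /* to FFT representation */
+	PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(f, 2);   /* back to coefficients */
+}
+#endif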
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
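+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): the
+ * split/merge pair as an FFT-domain round trip, here with logn = 3
+ * (N = 8). Splitting yields two half-size polynomials (4 'fpr' values
+ * each); merging them recovers the original FFT representation. The
+ * name example_split_merge is hypothetical.
+ */
+#if 0
+static void
+example_split_merge(const fpr *f_fft) {
+	fpr f0[4], f1[4], f2[8];
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(f0, f1, f_fft, 3);
+	PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(f2, f0, f1, 3);
+	/* f2 now matches f_fft up to floating-point rounding */
+}
+#endif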
+
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
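+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * buffer sizing for a degree-1024 (logn = 10) key pair. The union
+ * provides the required alignment for uint32_t, uint64_t and fpr; the
+ * name example_keygen_1024 is hypothetical, and the SHAKE256 context
+ * is assumed to be already seeded and flipped.
+ */
+#if 0
+static void
+example_keygen_1024(inner_shake256_context *seeded_rng) {
+	static int8_t f[1024], g[1024], F[1024], G[1024];
+	static uint16_t h[1024];
+	static union {
+		uint8_t b[FALCON_KEYGEN_TEMP_10];   /* 28*2^10 = 28672 bytes */
+		uint64_t dummy_u64;
+		fpr dummy_fpr;
+	} tmp;
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_keygen(seeded_rng,
+		f, g, F, G, h, 10, tmp.b);
+}
+#endif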
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
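+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * expanding a private key and signing with it at degree 1024
+ * (logn = 10). The expanded key takes (8*10+40)*1024 = 122880 bytes
+ * and tmp takes 48*1024 = 49152 bytes, both 64-bit aligned. The name
+ * example_sign_tree_1024 is hypothetical.
+ */
+#if 0
+static void
+example_sign_tree_1024(int16_t *sig, inner_shake256_context *rng,
+	const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+	const uint16_t *hm) {
+	static fpr expanded_key[122880 / sizeof(fpr)];
+	static union {
+		uint8_t b[49152];
+		uint64_t dummy_u64;
+		fpr dummy_fpr;
+	} tmp;
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey(expanded_key,
+		f, g, F, G, 10, tmp.b);
+	PQCLEAN_FALCONPADDED1024_CLEAN_sign_tree(sig, rng,
+		expanded_key, hm, 10, tmp.b);
+}
+#endif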
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED1024_CLEAN_gaussian0_sampler(prng *p);
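+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): a
+ * plausible sampler_context setup for degree 1024, filling sigma_min
+ * from the fpr_sigma_min[] table and taking isigma from the
+ * fpr_inv_sigma[] table (both indexed by logn). The name
+ * example_sampler_setup is hypothetical.
+ */
+#if 0
+static int
+example_sampler_setup(inner_shake256_context *src) {
+	sampler_context sc;
+
+	PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&sc.p, src);
+	sc.sigma_min = fpr_sigma_min[10];
+	/* one sample centered on 0, with the per-degree standard deviation */
+	return PQCLEAN_FALCONPADDED1024_CLEAN_sampler(&sc, fpr_zero,
+		fpr_inv_sigma[10]);
+}
+#endif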
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/keygen.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/keygen.c
new file mode 100644
index 000000000..411c37463
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/keygen.c
@@ -0,0 +1,4234 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
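+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * converting to and from Montgomery representation with the helpers
+ * above, for an odd prime p with 2^30 < p < 2^31. Going to Montgomery
+ * form is a Montgomery multiplication by R2 = 2^62 mod p; coming back
+ * is a Montgomery multiplication by 1. The name example_mulmod is
+ * hypothetical.
+ */
+#if 0
+static uint32_t
+example_mulmod(uint32_t x, uint32_t y, uint32_t p) {
+	uint32_t p0i, R2, xm, ym, zm;
+
+	p0i = modp_ninv31(p);                 /* -1/p mod 2^31 */
+	R2 = modp_R2(p, p0i);                 /* 2^62 mod p */
+	xm = modp_montymul(x, R2, p, p0i);    /* to Montgomery: x*R mod p */
+	ym = modp_montymul(y, R2, p, p0i);    /* to Montgomery: y*R mod p */
+	zm = modp_montymul(xm, ym, p, p0i);   /* (x*y)*R mod p */
+	return modp_montymul(zm, 1, p, p0i);  /* back to normal: x*y mod p */
+}
+#endif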
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ *   R     2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+					modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+	 * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
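+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources):
+ * multiplying two polynomials modulo X^N+1 and modulo a small prime by
+ * going through the NTT. Inputs a[] and b[] (n = 2^logn elements each,
+ * values in 0..p-1) are overwritten with their NTT; gm[] and igm[] are
+ * n-element scratch arrays. The extra montymul by R2 compensates for
+ * the 1/R factor of the pointwise Montgomery products. All names in
+ * example_ntt_polymul are hypothetical.
+ */
+#if 0
+static void
+example_ntt_polymul(uint32_t *d, uint32_t *a, uint32_t *b,
+	uint32_t *gm, uint32_t *igm, unsigned logn) {
+	uint32_t p, p0i, R2;
+	size_t u, n;
+
+	p = PRIMES[0].p;
+	p0i = modp_ninv31(p);
+	R2 = modp_R2(p, p0i);
+	n = (size_t)1 << logn;
+	modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+	modp_NTT2(a, gm, logn, p, p0i);
+	modp_NTT2(b, gm, logn, p, p0i);
+	for (u = 0; u < n; u ++) {
+		/* montymul(a,b) = a*b/R; the montymul by R2 restores a*b mod p */
+		d[u] = modp_montymul(
+			modp_montymul(a[u], b[u], p, p0i), R2, p, p0i);
+	}
+	modp_iNTT2(d, igm, logn, p, p0i);
+}
+#endif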
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
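+
+/*
+ * Editorial illustration (not part of the Falcon/PQClean sources): the
+ * 31-bit limb convention used by the zint_* functions below. Each
+ * uint32_t word carries 31 value bits with its top bit clear; a 64-bit
+ * value therefore spans three limbs. The name example_encode_u64 is
+ * hypothetical.
+ */
+#if 0
+static void
+example_encode_u64(uint32_t *x, uint64_t v) {
+	x[0] = (uint32_t)v & 0x7FFFFFFF;           /* bits 0..30 */
+	x[1] = (uint32_t)(v >> 31) & 0x7FFFFFFF;   /* bits 31..61 */
+	x[2] = (uint32_t)(v >> 62);                /* bits 62..63 */
+}
+#endif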
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
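+	/*
+	 * Note that modp_montymul(x, R2, p, p0i) = x*R2/2^31 = x*2^31 mod p
+	 * (since R2 = 2^62 mod p), which is how the "multiply by 2^31"
+	 * step is realized below.
+	 */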
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
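+	/*
+	 * If d is negative (bit 30 of its top word is set), then the
+	 * unsigned interpretation computed above is d + 2^(31*dlen); we
+	 * correct it by subtracting Rx = 2^(31*dlen) mod p.
+	 */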
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
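+
+/*
+ * Toy example of the Garner step above (with moduli far smaller than the
+ * 31-bit primes actually used): to rebuild x from x = 3 mod 7 and
+ * x = 5 mod 11, take q = 7 and s = 1/q mod 11 = 8; then
+ * xr = s*(5 - 3) mod 11 = 5 and x = 3 + 7*5 = 38, which indeed satisfies
+ * 38 mod 7 = 3 and 38 mod 11 = 5.
+ */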
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
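+		/*
+		 * The carries may be negative (xa, xb, ya and yb are
+		 * signed); we reinterpret the 64-bit accumulators as
+		 * signed so that the right shift propagates the sign.
+		 */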
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have a 32-th bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod y
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remaining of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+	 * - If a and b are odd, and b > a, then a similar
+	 *   argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+	 * and pb with -pb. Each such step reduces the total length
+	 * (sum of lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, thus a division by 2 really is a
+ * non-multiplication by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
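+	/*
+	 * (rc | -rc) >> 31 is 1 if rc != 0, and 0 otherwise; the final
+	 * AND with x[0] and y[0] keeps only bit 0, i.e. it also checks
+	 * that both inputs are odd.
+	 */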
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
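+	/*
+	 * Copy bit 30 into bit 31, i.e. sign-extend the 31-bit value to
+	 * 32 bits before reinterpreting it as an int32_t.
+	 */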
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to an upper word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
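+			/*
+			 * Reduction modulo X^N+1: when the target index
+			 * wraps past N-1, we go back to index 0 and negate
+			 * the multiplier, since X^N = -1.
+			 */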
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
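+
+/*
+ * Sanity check on the table above: gauss_1024_12289[0] / 2^63 is about
+ * 0.139, which matches P(x = 0) ~ 1/(sigma*sqrt(2*pi)) for
+ * sigma = 1.17*sqrt(12289/2048) ~ 2.87.
+ */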
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
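+	/*
+	 * If any partial sum reached 2^31, then the top bit of ng is set,
+	 * and -(ng >> 31) is 0xFFFFFFFF, which saturates the returned
+	 * value.
+	 */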
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure of the NTRU solving equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in fk->tmp (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+	 *    (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, reduction
+ * coefficient will remain low. If it goes out-of-range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
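+
+	/*
+	 * In short, each iteration computes
+	 *    k = round((F*adj(f) + G*adj(g)) / (f*adj(f) + g*adj(g)))
+	 * (with suitable scalings), then subtracts k*f from F and k*g
+	 * from G.
+	 */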
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffer (N fp values),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -2^31-1..+2^31-1.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+	 * "late" instead of "early"). We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
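Editorial note: the sign re-extension near the end of this function relies on the 31-bit limb convention, where bit 30 of the most significant word carries the sign, so the padding word must be 0 or 0x7FFFFFFF. A standalone sketch of the `-(x >> 30) >> 1` idiom (the helper name is ours, not part of the upstream file):

```c
#include <assert.h>
#include <stdint.h>

/* Standalone sketch (helper name is ours, not part of the upstream file):
 * derive the padding word used for sign re-extension of a big integer held
 * in 31-bit limbs, as in the loop above. Bit 30 of the top limb is the sign
 * bit, so the padding word is 0 for non-negative values and 0x7FFFFFFF for
 * negative ones. */
static uint32_t
sign_extension_word(uint32_t top_limb) {
    return (uint32_t)(-(top_limb >> 30)) >> 1;
}

int main(void) {
    assert(sign_extension_word(0x12345678) == 0);           /* bit 30 clear */
    assert(sign_extension_word(0x40000000) == 0x7FFFFFFF);  /* bit 30 set   */
    assert(sign_extension_word(0x7FFFFFFF) == 0x7FFFFFFF);  /* bit 30 set   */
    return 0;
}
```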
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the first n elements
+ * of the inverse NTT table (igm); the call to modp_mkgm()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * The integer representation of F and G is no longer needed; we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
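Editorial note: the single-pass Babai reduction above has a scalar analogue that may help when reading the FFT version: project (F,G) onto (f,g), round the ratio to an integer k, and subtract k*(f,g); the determinant f*G - g*F (the analogue of the NTRU equation) is untouched while F and G shrink. A toy sketch with illustrative plain numbers:

```c
#include <assert.h>
#include <math.h>
#include <stdio.h>

/* Toy scalar analogue of the Babai reduction above (illustrative values
 * only): k is the rounded projection of (F,G) onto (f,g); subtracting
 * k*(f,g) shrinks (F,G) while leaving f*G - g*F untouched. The real code
 * does the same thing coefficient-wise in the FFT domain. */
int main(void) {
    double f = 3.0, g = 2.0;
    double F = 10000.0, G = 10763.0;   /* chosen so that f*G - g*F = 12289 */
    double q = f * G - g * F;

    double k = round((F * f + G * g) / (f * f + g * g));
    F -= k * f;
    G -= k * g;

    assert(q == 12289.0);
    assert(f * G - g * F == q);        /* the invariant is preserved */
    printf("k = %.0f, reduced F = %.0f, G = %.0f\n", k, F, G);
    return 0;
}
```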
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
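Editorial note: the final check above verifies f*G - g*F = q in the NTT domain modulo one small prime. The same identity can be sanity-checked with schoolbook negacyclic convolution (multiplication modulo X^n + 1); the standalone sketch below uses a deliberately degenerate n = 1 example only to exercise the helper, since it has no access to a real key:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define Q 12289

/* Naive negacyclic convolution: r = a*b mod (X^n + 1), in plain integers.
 * In this sketch n must be at most 16 (see the buffers in check_ntru()). */
static void
negacyclic_mul(int64_t *r, const int64_t *a, const int64_t *b, size_t n) {
    size_t i, j;

    memset(r, 0, n * sizeof *r);
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            if (i + j < n) {
                r[i + j] += a[i] * b[j];
            } else {
                r[i + j - n] -= a[i] * b[j];  /* X^n = -1 */
            }
        }
    }
}

/* Returns 1 if f*G - g*F equals the constant polynomial q mod (X^n + 1). */
static int
check_ntru(const int64_t *f, const int64_t *g,
           const int64_t *F, const int64_t *G, size_t n) {
    int64_t t1[16], t2[16];
    size_t u;

    negacyclic_mul(t1, f, G, n);
    negacyclic_mul(t2, g, F, n);
    for (u = 0; u < n; u++) {
        if (t1[u] - t2[u] != (u == 0 ? Q : 0)) {
            return 0;
        }
    }
    return 1;
}

int main(void) {
    /* Degenerate n = 1 case, only to exercise the helper: f = 1, g = 0,
     * F = 0, G = q trivially satisfies f*G - g*F = q. */
    int64_t f[1] = {1}, g[1] = {0}, F[1] = {0}, G[1] = {Q};
    assert(check_ntru(f, g, F, G, 1));
    return 0;
}
```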
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd (1 modulo 2);
+ * otherwise, the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
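Editorial note on why the parity trick works: modulo 2, phi = X^N + 1 factors as (X + 1)^N, so the resultant collapses to an evaluation at 1. A short derivation (standard algebra, stated here rather than taken from the Falcon sources):

```latex
\operatorname{Res}\bigl(f,\,X^N+1\bigr)
  \;\equiv\; \operatorname{Res}\bigl(f,\,(X+1)^N\bigr)
  \;\equiv\; f(1)^N \pmod{2}
```

Since f(1) is the sum of the coefficients, the resultant is odd exactly when that sum is odd, which is the condition the loop above enforces.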
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_CLEAN_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/pqclean.c
new file mode 100644
index 000000000..eb6cc85a1
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits per element, depending on degree)
+ * private g (6 or 5 bits per element, depending on degree)
+ * private F (8 bits per element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits per element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 1280 bytes
+ *
+ * message + signature:
+ * signature 1280 bytes
+ * message
+ */
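Editorial note: for logn = 10 the per-element widths above pin down the encoded sizes. The arithmetic below is a standalone sketch; the totals it checks (2305, 1793 and 1280 bytes) are what the CRYPTO_* constants in api.h are expected to be, stated here as an assumption rather than quoted from that file.

```c
#include <assert.h>
#include <stddef.h>

/* Standalone size arithmetic for the logn = 10 (Falcon-padded-1024) case of
 * the format described above; f and g use 5 bits per element at this degree
 * and F uses 8. The expected totals are assumptions about what api.h
 * defines, written out here only to show where they come from. */
int main(void) {
    size_t n = 1024;

    size_t sk_len = 1             /* header byte 0101nnnn   */
                    + n * 5 / 8   /* f: 5 bits per element  */
                    + n * 5 / 8   /* g: 5 bits per element  */
                    + n * 8 / 8;  /* F: 8 bits per element  */
    size_t pk_len = 1             /* header byte 0000nnnn   */
                    + n * 14 / 8; /* h: 14 bits per element */
    size_t sig_len = 1            /* header byte 0011nnnn   */
                     + 40         /* nonce                  */
                     + 1239;      /* compressed s, padded   */

    assert(sk_len == 2305);
    assert(pk_len == 1793);
    assert(sig_len == 1280);
    return 0;
}
```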
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_10];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024];
+ uint16_t h[1024];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 10;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ f, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ g, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ F, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[10]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 10;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 10);
+ if (v != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[1024], g[1024], F[1024], G[1024];
+ struct {
+ int16_t sig[1024];
+ uint16_t hm[1024];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 10) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ f, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ g, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_fg_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_trim_i8_decode(
+ F, 10, PQCLEAN_FALCONPADDED1024_CLEAN_max_FG_bits[10],
+ sk + u, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED1024_CLEAN_complete_private(G, f, g, F, 10, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 10, tmp.b);
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_comp_encode(sigbuf, sigbuflen, r.sig, 10);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 1024];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[1024], hm[1024];
+ int16_t sig[1024];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 10) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED1024_CLEAN_modq_decode(h, 10,
+ pk + 1, PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED1024_CLEAN_to_ntt_monty(h, 10);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED1024_CLEAN_comp_decode(sig, 10, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED1024_CLEAN_hash_to_point_ct(&sc, hm, 10, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw(hm, sig, h, 10, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 10;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 10) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 10;
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 10) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
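Editorial note: taken together, the functions above implement the standard PQClean signature API. A hypothetical round-trip through the detached-signature pair might look as follows (sketch only; it assumes a randombytes() implementation is linked in, as PQClean requires, and reuses the constants from api.h already referenced above):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "api.h"

/* Hypothetical round-trip through the detached-signature API above: keypair,
 * sign, verify. Sketch only; error handling is reduced to asserts. */
int main(void) {
    uint8_t pk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_PUBLICKEYBYTES];
    uint8_t sk[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_SECRETKEYBYTES];
    uint8_t sig[PQCLEAN_FALCONPADDED1024_CLEAN_CRYPTO_BYTES];
    uint8_t msg[] = "example message";
    size_t siglen = 0;

    assert(PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(pk, sk) == 0);
    assert(PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(
               sig, &siglen, msg, sizeof msg, sk) == 0);
    assert(siglen == sizeof sig);  /* padded variant: fixed-length signatures */
    assert(PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(
               sig, siglen, msg, sizeof msg, pk) == 0);
    return 0;
}
```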
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/rng.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/rng.c
new file mode 100644
index 000000000..169d35fb2
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/rng.c
@@ -0,0 +1,188 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <assert.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ uint32_t *d32 = (uint32_t *) p->state.d;
+ uint64_t *d64 = (uint64_t *) p->state.d;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ d32[i] = w;
+ }
+ tl = d32[48 / sizeof(uint32_t)];
+ th = d32[52 / sizeof(uint32_t)];
+ d64[48 / sizeof(uint64_t)] = tl + (th << 32);
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * The state consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the last 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
+ */
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
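Editorial note: the interleaving loop above writes byte b of word v of block u at offset (u << 2) + (v << 5) + b; since b + 4*u ranges over 0..31 within each 32-byte stripe selected by v, these offsets form a permutation of 0..511 and exactly fill the 512-byte output buffer (8 blocks of 64 bytes). A small standalone check of that indexing:

```c
#include <assert.h>
#include <string.h>

/* Check that the AVX2-style interleaving used above -- 8 blocks of 16 words
 * of 4 bytes, byte b of word v of block u stored at (u << 2) + (v << 5) + b
 * -- hits every offset of the 512-byte output buffer exactly once. */
int main(void) {
    unsigned char seen[512];
    unsigned u, v, b;

    memset(seen, 0, sizeof seen);
    for (u = 0; u < 8; u++) {             /* ChaCha20 block    */
        for (v = 0; v < 16; v++) {        /* state word        */
            for (b = 0; b < 4; b++) {     /* byte within word  */
                unsigned off = (u << 2) + (v << 5) + b;
                assert(off < 512 && seen[off] == 0);
                seen[off] = 1;
            }
        }
    }
    return 0;
}
```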
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/sign.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/sign.c
new file mode 100644
index 000000000..a7dbbfc62
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/sign.c
@@ -0,0 +1,1248 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
+ return (logn + 1) << logn;
+}
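Editorial note: the closed form returned above, (logn + 1)*2^logn, does solve the recurrence stated in the comment (s(0) = 1, s(logn) = 2^logn + 2*s(logn-1)). A quick standalone check of the first few levels:

```c
#include <assert.h>

/* Check that the closed form (logn + 1) << logn returned above satisfies
 * the recurrence from the comment: s(0) = 1 and
 * s(logn) = 2^logn + 2*s(logn - 1). */
int main(void) {
    unsigned s_prev = 1;  /* s(0) */
    unsigned logn;

    assert(((0 + 1) << 0) == 1);
    for (logn = 1; logn <= 10; logn++) {
        unsigned s = (1u << logn) + 2 * s_prev;   /* recurrence  */
        assert(s == ((logn + 1) << logn));        /* closed form */
        s_prev = s;
    }
    return 0;
}
```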
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regard to the modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
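+
+ /*
+ * t0, t1, tx and ty account for four polynomials; ffSampling_fft()
+ * receives two more at ty + n, which is how the six-polynomial
+ * requirement on tmp[] is met.
+ */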
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
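+
+ /*
+ * ng has its top bit set if the running sum ever reached 2^31; in
+ * that case sqn is saturated to 2^32 - 1, so that the short-vector
+ * test below rejects the signature.
+ */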
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regard to the modulus).
+ */
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not preserve the basis matrix, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED1024_CLEAN_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ size_t u;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+ z = 0;
+ for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) {
+ uint32_t w0, w1, w2, cc;
+
+ w0 = dist[u + 2];
+ w1 = dist[u + 1];
+ w2 = dist[u + 0];
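+ /*
+ * The next three lines subtract this 72-bit table entry from
+ * v0..v2, one 24-bit limb at a time; cc propagates the borrow and
+ * ends up as 1 exactly when v0..v2 is lower than the entry.
+ */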
+ cc = (v0 - w0) >> 31;
+ cc = (v1 - w1 - cc) >> 31;
+ cc = (v2 - w2 - cc) >> 31;
+ z += (int)cc;
+ }
+ return z;
+
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
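+ /*
+ * Branchless saturation: when sw > 63, the subtraction 63 - sw
+ * wraps around, its top bit turns the mask on, and sw is replaced
+ * with 63; otherwise sw is left unchanged.
+ */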
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r). We compare exp(-x) lazily with the
+ * PRNG output, to limit the number of random bytes consumed; the
+ * sign of the difference yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED1024_CLEAN_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
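+ /* ((b << 1) - 1) is +1 or -1: z is z0 + 1 when b = 1, -z0 when b = 0 */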
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED1024_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED1024_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-1024_clean/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-1024_clean/vrfy.c
new file mode 100644
index 000000000..58dbf0bec
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-1024_clean/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
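+
+/*
+ * As a numerical check: 65536 mod 12289 = 4091 (R), 4091^2 mod 12289
+ * = 10952 (R2), and 12289 * 12287 = 2304 * 2^16 - 1, so Q0I is indeed
+ * -1/q mod 2^16.
+ */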
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_rshift1(uint32_t x) {
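+ /*
+ * If x is odd, adding q (which is odd) makes it even; the shift
+ * then yields the value y such that 2*y = x mod q.
+ */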
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
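+ /*
+ * For instance, mq_montymul(1, 1) = 2304, which is indeed 1/R mod q
+ * since 2304 * 2^16 = 12289 * 12287 + 1.
+ */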
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x and y are lower than q, z is less than q^2
+ * and w is at most (2^16 - 1)*q, so the sum is less than 2^30 and
+ * fits easily on 32 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We do a subtraction, then a conditional addition of q, to
+ * ensure the result is in the expected range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
+ return mq_montymul(y18, x);
+}
+
+/*
+ * Compute NTT on a ring element.
+ */
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
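+ /*
+ * Butterfly: a[j], a[j + ht] <- a[j] + s*a[j + ht], a[j] - s*a[j + ht].
+ * Since GMb[] entries are already scaled by R, the Montgomery
+ * multiplication yields the plain product by the twiddle factor.
+ */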
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+ * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
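+ /* subtract q when w > q/2; the mask comes from the sign bit */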
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED1024_CLEAN_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
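+ /* tt[u] - 1 has its top bit set exactly when tt[u] is zero */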
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information in a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED1024_CLEAN_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/LICENSE b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/LICENSE
new file mode 100644
index 000000000..4df2d7836
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/LICENSE
@@ -0,0 +1,57 @@
+This ARMv8 NEON implementation is provided under the Apache 2.0 license:
+
+/*
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+Based on the reference code provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/api.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/api.h
new file mode 100644
index 000000000..deba20b36
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED512_AARCH64_API_H
+#define PQCLEAN_FALCONPADDED512_AARCH64_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES 1281
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES 666
+
+#define PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_ALGNAME "Falcon-padded-512"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/codec.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/codec.c
new file mode 100644
index 000000000..3fe3a9452
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/codec.c
@@ -0,0 +1,554 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
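+ /* each coefficient is encoded over 14 bits, since q = 12289 < 2^14 */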
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= FALCON_Q) {
+ return 0;
+ }
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_modq_decode(uint16_t *x, const void *in, size_t max_in_len, unsigned logn) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = 1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, uint8_t bits) {
+ size_t u, out_len;
+ int8_t minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ out_len = (size_t) ((FALCON_N * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+
+ maxv = (int8_t) (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int8(x, minv, maxv)) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < FALCON_N; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(int8_t *x, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ in_len = ((FALCON_N * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < FALCON_N) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < FALCON_N) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x) {
+ uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int16(x, -2047, 2047)) {
+ return 0;
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * will fit in the accumulator, which is a uint32_t.
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
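+
+/*
+ * Worked example of the compressed format above (informative): the value
+ * -137 is emitted as a sign bit of 1, the low 7 bits of |x| = 137
+ * (0001001), then |x| >> 7 = 1 zero followed by a terminating 1, for
+ * 10 bits in total; any coefficient with |x| < 128 costs exactly 9 bits.
+ */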
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < FALCON_N; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+
+ x[u] = (int16_t)(s ? -(int)m : (int)m);
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for the private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
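+
+/*
+ * Worked example (informative): for logn = 9 (n = 512, the parameter set
+ * used here), max_fg_bits[9] = 6 and max_FG_bits[9] = 8, so f and g each
+ * encode into (512 * 6 + 7) / 8 = 384 bytes while F and G each take
+ * 512 bytes under trim_i8_encode().
+ */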
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest observed signature coefficient during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/common.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/common.c
new file mode 100644
index 000000000..b461baa8c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/common.c
@@ -0,0 +1,549 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "macrous.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 5 * FALCON_Q) {
+ while (w >= FALCON_Q) {
+ w -= FALCON_Q;
+ }
+ *x++ = (uint16_t)w;
+ n--;
+ }
+ }
+}
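+
+/*
+ * Informative example of the rejection step above: with FALCON_Q = 12289,
+ * a 16-bit sample w = 61444 is kept (61444 < 5 * 12289 = 61445) and
+ * reduced to 61444 - 4 * 12289 = 12288, while w = 61445 is rejected and
+ * two fresh bytes are drawn.
+ */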
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce modulo q the values; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
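+
+/*
+ * Informative example for logn = 9 (n = 512): over = 205, so m = 717
+ * samples are drawn; indices 0..511 land in x[], indices 512..716 land
+ * in tt1[] (i.e. tmp), and tt2[] stays unused because m <= 2*n. The
+ * squeeze-out passes then run for p = 1, 2, 4, ..., 128.
+ */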
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h
+ * NEON provides the signed saturating doubling multiply-accumulate
+ * instructions sqdmlal/sqdmlal2, so we keep 2 parallel dependency chains
+ * rather than 1 for better scheduling.
+ * Each for loop is tuned for cache locality.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short(const int16_t *s1, const int16_t *s2) {
+ // Total SIMD register 18 = 16 + 2
+ int16x8x4_t neon_s1, neon_s2, neon_s3, neon_s4; // 16
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s1[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s1[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s1[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s1[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
+ for (unsigned u = 0; u < FALCON_N; u += 128) {
+ vload_s16_x4(neon_s1, &s2[u]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[0]), vget_low_s16(neon_s1.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[1]), vget_low_s16(neon_s1.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[2]), vget_low_s16(neon_s1.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s1.val[3]), vget_low_s16(neon_s1.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[0], neon_s1.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[1], neon_s1.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[2], neon_s1.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s1.val[3], neon_s1.val[3]);
+
+ vload_s16_x4(neon_s2, &s2[u + 32]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[0]), vget_low_s16(neon_s2.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[1]), vget_low_s16(neon_s2.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[2]), vget_low_s16(neon_s2.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s2.val[3]), vget_low_s16(neon_s2.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[0], neon_s2.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[1], neon_s2.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[2], neon_s2.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s2.val[3], neon_s2.val[3]);
+
+ vload_s16_x4(neon_s3, &s2[u + 64]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[0]), vget_low_s16(neon_s3.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[1]), vget_low_s16(neon_s3.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[2]), vget_low_s16(neon_s3.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s3.val[3]), vget_low_s16(neon_s3.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[0], neon_s3.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[1], neon_s3.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[2], neon_s3.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s3.val[3], neon_s3.val[3]);
+
+ vload_s16_x4(neon_s4, &s2[u + 96]);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[0]), vget_low_s16(neon_s4.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[1]), vget_low_s16(neon_s4.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[2]), vget_low_s16(neon_s4.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_s4.val[3]), vget_low_s16(neon_s4.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[0], neon_s4.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[1], neon_s4.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[2], neon_s4.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_s4.val[3], neon_s4.val[3]);
+ }
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
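+
+/*
+ * Note (informative): vqdmlal accumulates 2*a*b, so neon_s and neon_sh
+ * each hold twice their partial sums of squares; the halving add
+ * vhaddq_s32 above cancels that factor of 2 before the lanes are folded
+ * with saturating adds and compared against l2bound[FALCON_LOGN].
+ */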
+
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const fpr *t0,
+ const fpr *t1) {
+ // Total SIMD registers: 26 = 16 + 8 + 2
+ int16x8x4_t neon_hm, neon_ts; // 8
+ float64x2x4_t neon_tf0, neon_tf1, neon_tf2, neon_tf3; // 16
+ int64x2x4_t neon_ts0, neon_ts1, neon_ts2, neon_ts3; // 16
+ int32x4x4_t neon_ts4, neon_ts5; // 8
+ int32x4_t neon_s, neon_sh; // 2
+ int32x2_t tmp;
+ uint32_t s;
+
+ neon_s = vdupq_n_s32(0);
+ neon_sh = vdupq_n_s32(0);
+
+ // s1tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t0[i]);
+ vloadx4(neon_tf1, &t0[i + 8]);
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t0[i + 16]);
+ vloadx4(neon_tf3, &t0[i + 24]);
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ // hm = hm - fpr_rint(t0)
+ vload_s16_x4(neon_hm, &hm[i]);
+ neon_hm.val[0] = vsubq_s16(neon_hm.val[0], neon_ts.val[0]);
+ neon_hm.val[1] = vsubq_s16(neon_hm.val[1], neon_ts.val[1]);
+ neon_hm.val[2] = vsubq_s16(neon_hm.val[2], neon_ts.val[2]);
+ neon_hm.val[3] = vsubq_s16(neon_hm.val[3], neon_ts.val[3]);
+ vstore_s16_x4(&s1tmp[i], neon_hm);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[0]), vget_low_s16(neon_hm.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[1]), vget_low_s16(neon_hm.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[2]), vget_low_s16(neon_hm.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_hm.val[3]), vget_low_s16(neon_hm.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[0], neon_hm.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[1], neon_hm.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[2], neon_hm.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_hm.val[3], neon_hm.val[3]);
+ }
+
+ // s2tmp
+ for (int i = 0; i < FALCON_N; i += 32) {
+ vloadx4(neon_tf0, &t1[i]);
+ vloadx4(neon_tf1, &t1[i + 8]);
+
+ vfrintx4(neon_ts0, neon_tf0);
+ vfrintx4(neon_ts1, neon_tf1);
+
+ neon_ts4.val[0] = vmovn_high_s64(vmovn_s64(neon_ts0.val[0]), neon_ts0.val[1]);
+ neon_ts4.val[1] = vmovn_high_s64(vmovn_s64(neon_ts0.val[2]), neon_ts0.val[3]);
+ neon_ts4.val[2] = vmovn_high_s64(vmovn_s64(neon_ts1.val[0]), neon_ts1.val[1]);
+ neon_ts4.val[3] = vmovn_high_s64(vmovn_s64(neon_ts1.val[2]), neon_ts1.val[3]);
+
+ vloadx4(neon_tf2, &t1[i + 16]);
+ vloadx4(neon_tf3, &t1[i + 24]);
+
+ vfrintx4(neon_ts2, neon_tf2);
+ vfrintx4(neon_ts3, neon_tf3);
+
+ neon_ts5.val[0] = vmovn_high_s64(vmovn_s64(neon_ts2.val[0]), neon_ts2.val[1]);
+ neon_ts5.val[1] = vmovn_high_s64(vmovn_s64(neon_ts2.val[2]), neon_ts2.val[3]);
+ neon_ts5.val[2] = vmovn_high_s64(vmovn_s64(neon_ts3.val[0]), neon_ts3.val[1]);
+ neon_ts5.val[3] = vmovn_high_s64(vmovn_s64(neon_ts3.val[2]), neon_ts3.val[3]);
+
+ neon_ts.val[0] = vmovn_high_s32(vmovn_s32(neon_ts4.val[0]), neon_ts4.val[1]);
+ neon_ts.val[1] = vmovn_high_s32(vmovn_s32(neon_ts4.val[2]), neon_ts4.val[3]);
+ neon_ts.val[2] = vmovn_high_s32(vmovn_s32(neon_ts5.val[0]), neon_ts5.val[1]);
+ neon_ts.val[3] = vmovn_high_s32(vmovn_s32(neon_ts5.val[2]), neon_ts5.val[3]);
+
+ neon_ts.val[0] = vnegq_s16(neon_ts.val[0]);
+ neon_ts.val[1] = vnegq_s16(neon_ts.val[1]);
+ neon_ts.val[2] = vnegq_s16(neon_ts.val[2]);
+ neon_ts.val[3] = vnegq_s16(neon_ts.val[3]);
+ vstore_s16_x4(&s2tmp[i], neon_ts);
+
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[0]), vget_low_s16(neon_ts.val[0]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[1]), vget_low_s16(neon_ts.val[1]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[2]), vget_low_s16(neon_ts.val[2]));
+ neon_s = vqdmlal_s16(neon_s, vget_low_s16(neon_ts.val[3]), vget_low_s16(neon_ts.val[3]));
+
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[0], neon_ts.val[0]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[1], neon_ts.val[1]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[2], neon_ts.val[2]);
+ neon_sh = vqdmlal_high_s16(neon_sh, neon_ts.val[3], neon_ts.val[3]);
+ }
+
+ // 32x4
+ neon_s = vhaddq_s32(neon_s, neon_sh);
+ // 32x4 -> 32x2
+ tmp = vqadd_s32(vget_low_s32(neon_s), vget_high_s32(neon_s));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ s = (uint32_t) vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s <= l2bound[FALCON_LOGN];
+}
+
+int32_t PQCLEAN_FALCONPADDED512_AARCH64_poly_small_sqnorm(const int8_t *f) {
+ int8x16x4_t a;
+ int16x8x4_t b, c;
+ int32x4_t norm, norm_sh;
+
+ norm = vdupq_n_s32(0);
+ norm_sh = vdupq_n_s32(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&f[i]);
+
+ b.val[0] = vmovl_s8(vget_low_s8(a.val[0]));
+ b.val[1] = vmovl_high_s8(a.val[0]);
+ b.val[2] = vmovl_s8(vget_low_s8(a.val[1]));
+ b.val[3] = vmovl_high_s8(a.val[1]);
+
+ c.val[0] = vmovl_s8(vget_low_s8(a.val[2]));
+ c.val[1] = vmovl_high_s8(a.val[2]);
+ c.val[2] = vmovl_s8(vget_low_s8(a.val[3]));
+ c.val[3] = vmovl_high_s8(a.val[3]);
+
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[0]), vget_low_s16(b.val[0]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[1]), vget_low_s16(b.val[1]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[2]), vget_low_s16(b.val[2]));
+ norm = vqdmlal_s16(norm, vget_low_s16(b.val[3]), vget_low_s16(b.val[3]));
+
+ norm = vqdmlal_high_s16(norm, b.val[0], b.val[0]);
+ norm = vqdmlal_high_s16(norm, b.val[1], b.val[1]);
+ norm = vqdmlal_high_s16(norm, b.val[2], b.val[2]);
+ norm = vqdmlal_high_s16(norm, b.val[3], b.val[3]);
+
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[0]), vget_low_s16(c.val[0]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[1]), vget_low_s16(c.val[1]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[2]), vget_low_s16(c.val[2]));
+ norm_sh = vqdmlal_s16(norm_sh, vget_low_s16(c.val[3]), vget_low_s16(c.val[3]));
+
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[0], c.val[0]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[1], c.val[1]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[2], c.val[2]);
+ norm_sh = vqdmlal_high_s16(norm_sh, c.val[3], c.val[3]);
+ }
+ // 32x4
+ norm = vhaddq_s32(norm, norm_sh);
+ // 32x4 -> 32x2
+ int32x2_t tmp;
+ tmp = vqadd_s32(vget_low_s32(norm), vget_high_s32(norm));
+
+ // 32x2 -> 32x1
+ // Use saturating add to prevent overflow
+ int32_t s;
+ s = vqadds_s32(vget_lane_s32(tmp, 0), vget_lane_s32(tmp, 1));
+
+ return s;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft.c
new file mode 100644
index 000000000..9de1bc33e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft.c
@@ -0,0 +1,1038 @@
+/*
+ * High-speed vectorized FFT code for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Forward FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, v_re, v_im, t_re, t_im, s;
+
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0];
+
+ t_re = y_re * s;
+ t_im = y_im * s;
+
+ v_re = t_re - t_im;
+ v_im = t_re + t_im;
+
+ f[0] = x_re + v_re;
+ f[1] = x_re - v_re;
+ f[2] = x_im + v_im;
+ f[3] = x_im - v_im;
+}
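+
+/*
+ * Informative note on the butterfly above: the twiddle factor is
+ * s + i*s with s = fpr_tab_log2[0] (its real and imaginary parts are
+ * equal at this level), so y * twiddle has real part s*(y_re - y_im)
+ * and imaginary part s*(y_re + y_im), which is exactly v_re and v_im.
+ */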
+
+/*
+ * Vectorized 2 layers of Forward FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log3(fpr *f) {
+ // Total SIMD registers: 18 = 4 + 6 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t s_re_im, x, y; // 6
+ float64x2_t v_re, v_im, x_re, x_im, y_re, y_im, t_x, t_y; // 8
+
+ vloadx4(tmp, &f[0]);
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmul(v_re, tmp.val[1], s_re_im.val[0]);
+ vfmul(v_im, tmp.val[3], s_re_im.val[0]);
+
+ vfsub(t_x, v_re, v_im);
+ vfadd(t_y, v_re, v_im);
+
+ vfsub(tmp.val[1], tmp.val[0], t_x);
+ vfsub(tmp.val[3], tmp.val[2], t_y);
+
+ vfadd(tmp.val[0], tmp.val[0], t_x);
+ vfadd(tmp.val[2], tmp.val[2], t_y);
+
+ x_re = vtrn1q_f64(tmp.val[0], tmp.val[1]);
+ y_re = vtrn2q_f64(tmp.val[0], tmp.val[1]);
+ x_im = vtrn1q_f64(tmp.val[2], tmp.val[3]);
+ y_im = vtrn2q_f64(tmp.val[2], tmp.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(v_re, v_im, y_re, y_im, s_re_im.val[0], s_re_im.val[1]);
+
+ FPC_ADD(x.val[0], y.val[0], x_re, x_im, v_re, v_im);
+ FPC_SUB(x.val[1], y.val[1], x_re, x_im, v_re, v_im);
+
+ vstore2(&f[0], x);
+ vstore2(&f[4], y);
+}
+
+/*
+ * Vectorized 3 layers of Forward FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log4(fpr *f) {
+ // Total SIMD register: 26 = 8 + 18
+ float64x2x4_t t0, t1; // 8
+ float64x2x2_t x_re, x_im, y_re, y_im, v1, v2, tx, ty, s_re_im; // 18
+
+ vloadx4(t0, &f[0]);
+ vloadx4(t1, &f[8]);
+ vload(s_re_im.val[0], &fpr_tab_log2[0]);
+
+ vfmul(v1.val[0], t0.val[2], s_re_im.val[0]);
+ vfmul(v1.val[1], t0.val[3], s_re_im.val[0]);
+
+ vfmul(v2.val[0], t1.val[2], s_re_im.val[0]);
+ vfmul(v2.val[1], t1.val[3], s_re_im.val[0]);
+
+ vfsub(tx.val[0], v1.val[0], v2.val[0]);
+ vfsub(tx.val[1], v1.val[1], v2.val[1]);
+
+ vfadd(ty.val[0], v1.val[0], v2.val[0]);
+ vfadd(ty.val[1], v1.val[1], v2.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[2], t1.val[2], tx.val[0], ty.val[0]);
+ FWD_BOT(t0.val[1], t1.val[1], t0.val[3], t1.val[3], tx.val[1], ty.val[1]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ FWD_TOP_LANE(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ x_re.val[0] = t0.val[0];
+ x_re.val[1] = t0.val[2];
+ y_re.val[0] = t0.val[1];
+ y_re.val[1] = t0.val[3];
+
+ x_im.val[0] = t1.val[0];
+ x_im.val[1] = t1.val[2];
+ y_im.val[0] = t1.val[1];
+ y_im.val[1] = t1.val[3];
+
+ t0.val[0] = vzip1q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[1] = vzip2q_f64(x_re.val[0], x_re.val[1]);
+ t0.val[2] = vzip1q_f64(y_re.val[0], y_re.val[1]);
+ t0.val[3] = vzip2q_f64(y_re.val[0], y_re.val[1]);
+
+ t1.val[0] = vzip1q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[1] = vzip2q_f64(x_im.val[0], x_im.val[1]);
+ t1.val[2] = vzip1q_f64(y_im.val[0], y_im.val[1]);
+ t1.val[3] = vzip2q_f64(y_im.val[0], y_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ FWD_TOP(v1.val[0], v1.val[1], t0.val[1], t1.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(v2.val[0], v2.val[1], t0.val[3], t1.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(t0.val[0], t1.val[0], t0.val[1], t1.val[1], v1.val[0], v1.val[1]);
+ FWD_BOTJ(t0.val[2], t1.val[2], t0.val[3], t1.val[3], v2.val[0], v2.val[1]);
+
+ vstore4(&f[0], t0);
+ vstore4(&f[8], t1);
+}
+
+/*
+ * Vectorized 4 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(fpr *f, const unsigned logn) {
+ // Total SIMD register: 34 = 2 + 32
+ float64x2x2_t s_re_im; // 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im, v_re, v_im; // 32
+
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ unsigned int level = logn - 3;
+ const fpr *fpr_tab2 = fpr_table[level++],
+ *fpr_tab3 = fpr_table[level++],
+ *fpr_tab4 = fpr_table[level++],
+ *fpr_tab5 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+
+ /*
+ * We only increase k2 when j has the form j = 32*x + 16.
+ * Taking both sides modulo 32, this amounts to checking (j % 32) == 16.
+ */
+ k2 += 2 * ((j & 31) == 16);
+
+ vloadx4(y_re, &f[j + 8]);
+ vloadx4(y_im, &f[j + 8 + hn]);
+
+ if (logn == 5) {
+ // Handle the special case of fpr_tab_log2, where re == im.
+ // This reduces the number of multiplications, although it uses
+ // the same number of instructions as the "else" branch.
+ vfmulx4_i(t_im, y_im, s_re_im.val[0]);
+ vfmulx4_i(t_re, y_re, s_re_im.val[0]);
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+ } else {
+ FWD_TOP_LANEx4(v_re, v_im, y_re, y_im, s_re_im.val[0]);
+ }
+
+ vloadx4(x_re, &f[j]);
+ vloadx4(x_im, &f[j + hn]);
+
+ if ((j >> 4) & 1) {
+ FWD_BOTJx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ } else {
+ FWD_BOTx4(x_re, x_im, y_re, y_im, v_re, v_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[2], x_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[2], y_im.val[2], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0]);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[1]);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ transpose_f64(x_re, x_re, v_re, 0, 2, 0);
+ transpose_f64(x_re, x_re, v_re, 1, 3, 1);
+ transpose_f64(x_im, x_im, v_im, 0, 2, 0);
+ transpose_f64(x_im, x_im, v_im, 1, 3, 1);
+
+ v_re.val[0] = x_re.val[2];
+ x_re.val[2] = x_re.val[1];
+ x_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = x_im.val[2];
+ x_im.val[2] = x_im.val[1];
+ x_im.val[1] = v_im.val[0];
+
+ transpose_f64(y_re, y_re, v_re, 0, 2, 2);
+ transpose_f64(y_re, y_re, v_re, 1, 3, 3);
+ transpose_f64(y_im, y_im, v_im, 0, 2, 2);
+ transpose_f64(y_im, y_im, v_im, 1, 3, 3);
+
+ v_re.val[0] = y_re.val[2];
+ y_re.val[2] = y_re.val[1];
+ y_re.val[1] = v_re.val[0];
+
+ v_im.val[0] = y_im.val[2];
+ y_im.val[2] = y_im.val[1];
+ y_im.val[1] = v_im.val[0];
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[0], t_im.val[0], x_re.val[1], x_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[1], t_im.val[1], x_re.val[3], x_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ FWD_TOP(t_re.val[2], t_im.val[2], y_re.val[1], y_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ FWD_TOP(t_re.val[3], t_im.val[3], y_re.val[3], y_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ FWD_BOT(x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j], x_re);
+ vstore4(&f[j + hn], x_im);
+
+ FWD_BOT(y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3]);
+
+ vstore4(&f[j + 8], y_re);
+ vstore4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ // Total SIMD register: 25 = 1 + 24
+ float64x2_t s_re_im; // 1
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im, v_re, v_im; // 24
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(b_re, &f[j + ht]);
+ vfmulx4_i(t_re, b_re, s_re_im);
+
+ vloadx4(b_im, &f[j + ht + hn]);
+ vfmulx4_i(t_im, b_im, s_re_im);
+
+ vfsubx4(v_re, t_re, t_im);
+ vfaddx4(v_im, t_re, t_im);
+
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+
+ FWD_BOTx4(a_re, a_im, b_re, b_im, v_re, v_im);
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j], a_re);
+
+ vstorex4(&f[j + ht + hn], b_im);
+ vstorex4(&f[j + hn], a_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Forward FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(fpr *f, const unsigned logn, const unsigned level) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 8 + 16 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_tab1 = NULL, *fpr_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - level + 2;
+
+ for (l = level - 1; l > 4; l -= 2) {
+ len = 1 << (l - 2);
+ fpr_tab1 = fpr_table[bar++];
+ fpr_tab2 = fpr_table[bar++];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << l) {
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOT(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOT(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+
+ start += 1U << l;
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_tab1[k1]);
+ vload(s2_re_im, &fpr_tab2[k2]);
+ k1 += 2U * ((start & 127) == 64);
+ k2 += 2;
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(y1_re, &f[j + 2 * len]);
+ vloadx2(y1_im, &f[j + 2 * len + hn]);
+
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], y1_re.val[0], y1_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], y1_re.val[1], y1_im.val[1], s1_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s1_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s1_re_im);
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(x2_re, &f[j + len]);
+ vloadx2(x2_im, &f[j + len + hn]);
+
+ FWD_BOTJ(x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOTJ(x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1]);
+ FWD_BOTJ(x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ FWD_TOP_LANE(t_re.val[0], t_im.val[0], x2_re.val[0], x2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[1], t_im.val[1], x2_re.val[1], x2_im.val[1], s2_re_im);
+ FWD_TOP_LANE(t_re.val[2], t_im.val[2], y2_re.val[0], y2_im.val[0], s2_re_im);
+ FWD_TOP_LANE(t_re.val[3], t_im.val[3], y2_re.val[1], y2_im.val[1], s2_re_im);
+
+ FWD_BOT(x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0]);
+ FWD_BOT(x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+ vstorex2(&f[j + len], x2_re);
+ vstorex2(&f[j + len + hn], x2_im);
+
+ FWD_BOTJ(y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2]);
+ FWD_BOTJ(y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3]);
+
+ vstorex2(&f[j + 2 * len], y1_re);
+ vstorex2(&f[j + 2 * len + hn], y1_im);
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+ }
+ }
+ }
+}
+
+/*
+ * 1 layer of Inverse FFT for 2 complex points (4 coefficients).
+ * Note: The scalar version is faster than vectorized code.
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log2(fpr *f) {
+ fpr x_re, x_im, y_re, y_im, s;
+ x_re = f[0];
+ y_re = f[1];
+ x_im = f[2];
+ y_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f[0] = (x_re + y_re) * 0.5;
+ f[2] = (x_im + y_im) * 0.5;
+
+ x_re = (x_re - y_re) * s;
+ x_im = (x_im - y_im) * s;
+
+ f[1] = x_im + x_re;
+ f[3] = x_im - x_re;
+}
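+
+/*
+ * Informative note: the 0.5 factors apply the inverse-FFT normalization
+ * for this single layer, and s = fpr_tab_log2[0] * 0.5 folds the
+ * conjugate twiddle together with that same halving.
+ */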
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 4 complex points (8 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log3(fpr *f) {
+ // Total SIMD registers: 12 = 4 + 8
+ float64x2x4_t tmp; // 4
+ float64x2x2_t x_re_im, y_re_im, v, s_re_im; // 8
+
+ vload2(x_re_im, &f[0]);
+ vload2(y_re_im, &f[4]);
+
+ vfsub(v.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfsub(v.val[1], y_re_im.val[0], y_re_im.val[1]);
+ vfadd(x_re_im.val[0], x_re_im.val[0], x_re_im.val[1]);
+ vfadd(x_re_im.val[1], y_re_im.val[0], y_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmul(y_re_im.val[0], v.val[1], s_re_im.val[1]);
+ vfmla(y_re_im.val[0], y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+ vfmls(y_re_im.val[1], y_re_im.val[1], v.val[0], s_re_im.val[1]);
+
+ tmp.val[0] = vtrn1q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[1] = vtrn2q_f64(x_re_im.val[0], y_re_im.val[0]);
+ tmp.val[2] = vtrn1q_f64(x_re_im.val[1], y_re_im.val[1]);
+ tmp.val[3] = vtrn2q_f64(x_re_im.val[1], y_re_im.val[1]);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfadd(x_re_im.val[0], tmp.val[0], tmp.val[1]);
+ vfadd(x_re_im.val[1], tmp.val[2], tmp.val[3]);
+ vfsub(v.val[0], tmp.val[0], tmp.val[1]);
+ vfsub(v.val[1], tmp.val[2], tmp.val[3]);
+
+ vfmuln(tmp.val[0], x_re_im.val[0], 0.25);
+ vfmuln(tmp.val[2], x_re_im.val[1], 0.25);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.25);
+
+ vfmul(y_re_im.val[0], v.val[0], s_re_im.val[0]);
+ vfmul(y_re_im.val[1], v.val[1], s_re_im.val[0]);
+
+ vfadd(tmp.val[1], y_re_im.val[1], y_re_im.val[0]);
+ vfsub(tmp.val[3], y_re_im.val[1], y_re_im.val[0]);
+
+ vstorex4(&f[0], tmp);
+}
+
+/*
+ * Vectorized 3 layers of Inverse FFT for 8 complex points (16 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log4(fpr *f) {
+ // Total SIMD registers: 18 = 12 + 6
+ float64x2x4_t re, im, t; // 12
+ float64x2x2_t t_re, t_im, s_re_im; // 6
+
+ vload4(re, &f[0]);
+ vload4(im, &f[8]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload2(s_re_im, &fpr_tab_log4[0]);
+
+ INV_BOTJ(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ // re: 0, 4 | 1, 5 | 2, 6 | 3, 7
+ // im: 8, 12| 9, 13|10, 14|11, 15
+ transpose_f64(re, re, t, 0, 1, 0);
+ transpose_f64(re, re, t, 2, 3, 1);
+ transpose_f64(im, im, t, 0, 1, 2);
+ transpose_f64(im, im, t, 2, 3, 3);
+
+ // re: 0, 1 | 4, 5 | 2, 3 | 6, 7
+ // im: 8, 9 | 12, 13|10, 11| 14, 15
+ t.val[0] = re.val[1];
+ re.val[1] = re.val[2];
+ re.val[2] = t.val[0];
+
+ t.val[1] = im.val[1];
+ im.val[1] = im.val[2];
+ im.val[2] = t.val[1];
+
+ // re: 0, 1 | 2, 3| 4, 5 | 6, 7
+ // im: 8, 9 | 10, 11| 12, 13| 14, 15
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[1], im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], re.val[2], im.val[2], re.val[3], im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab_log3[0]);
+
+ INV_BOTJ_LANE(re.val[1], im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(re.val[3], im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], re.val[0], im.val[0], re.val[2], im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], re.val[1], im.val[1], re.val[3], im.val[3]);
+
+ vfmuln(re.val[0], re.val[0], 0.12500000000);
+ vfmuln(re.val[1], re.val[1], 0.12500000000);
+ vfmuln(im.val[0], im.val[0], 0.12500000000);
+ vfmuln(im.val[1], im.val[1], 0.12500000000);
+
+ s_re_im.val[0] = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.12500000000);
+
+ vfmul(t_re.val[0], t_re.val[0], s_re_im.val[0]);
+ vfmul(t_re.val[1], t_re.val[1], s_re_im.val[0]);
+ vfmul(t_im.val[0], t_im.val[0], s_re_im.val[0]);
+ vfmul(t_im.val[1], t_im.val[1], s_re_im.val[0]);
+
+ vfsub(im.val[2], t_im.val[0], t_re.val[0]);
+ vfsub(im.val[3], t_im.val[1], t_re.val[1]);
+ vfadd(re.val[2], t_im.val[0], t_re.val[0]);
+ vfadd(re.val[3], t_im.val[1], t_re.val[1]);
+
+ vstorex4(&f[0], re);
+ vstorex4(&f[8], im);
+}
+
+/*
+ * Vectorized 4 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register: 26 = 24 + 2
+ float64x2x4_t x_re, x_im, y_re, y_im, t_re, t_im; // 24
+ float64x2x2_t s_re_im; // 2
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+
+ unsigned int level = logn;
+ const fpr *fpr_tab5 = fpr_table[level--],
+ *fpr_tab4 = fpr_table[level--],
+ *fpr_tab3 = fpr_table[level--],
+ *fpr_tab2 = fpr_table[level];
+ int k2 = 0, k3 = 0, k4 = 0, k5 = 0;
+
+ for (unsigned j = 0; j < hn; j += 16) {
+
+ vload4(x_re, &f[j]);
+ vload4(x_im, &f[j + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[2], t_im.val[2], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ vload4(y_re, &f[j + 8]);
+ vload4(y_im, &f[j + 8 + hn]);
+
+ INV_TOPJ(t_re.val[1], t_im.val[1], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(x_re.val[3], x_im.val[3], t_re.val[2], t_im.val[2], s_re_im.val[0], s_re_im.val[1]);
+
+ vload2(s_re_im, &fpr_tab5[k5]);
+ k5 += 4;
+
+ INV_BOTJ(y_re.val[1], y_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0], s_re_im.val[1]);
+
+ transpose_f64(x_re, x_re, t_re, 0, 1, 0);
+ transpose_f64(x_re, x_re, t_re, 2, 3, 1);
+ transpose_f64(y_re, y_re, t_re, 0, 1, 2);
+ transpose_f64(y_re, y_re, t_re, 2, 3, 3);
+
+ transpose_f64(x_im, x_im, t_im, 0, 1, 0);
+ transpose_f64(x_im, x_im, t_im, 2, 3, 1);
+ transpose_f64(y_im, y_im, t_im, 0, 1, 2);
+ transpose_f64(y_im, y_im, t_im, 2, 3, 3);
+
+ t_re.val[0] = x_re.val[1];
+ x_re.val[1] = x_re.val[2];
+ x_re.val[2] = t_re.val[0];
+
+ t_re.val[1] = y_re.val[1];
+ y_re.val[1] = y_re.val[2];
+ y_re.val[2] = t_re.val[1];
+
+ t_im.val[0] = x_im.val[1];
+ x_im.val[1] = x_im.val[2];
+ x_im.val[2] = t_im.val[0];
+
+ t_im.val[1] = y_im.val[1];
+ y_im.val[1] = y_im.val[2];
+ y_im.val[2] = t_im.val[1];
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[1], x_im.val[1]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x_re.val[2], x_im.val[2], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[1], y_im.val[1]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[2], y_im.val[2], y_re.val[3], y_im.val[3]);
+
+ vloadx2(s_re_im, &fpr_tab4[k4]);
+ k4 += 4;
+
+ INV_BOTJ_LANE(x_re.val[1], x_im.val[1], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJm_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJ_LANE(y_re.val[1], y_im.val[1], t_re.val[2], t_im.val[2], s_re_im.val[1]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[1]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x_re.val[0], x_im.val[0], x_re.val[2], x_im.val[2]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x_re.val[1], x_im.val[1], x_re.val[3], x_im.val[3]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y_re.val[0], y_im.val[0], y_re.val[2], y_im.val[2]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y_re.val[1], y_im.val[1], y_re.val[3], y_im.val[3]);
+
+ vload(s_re_im.val[0], &fpr_tab3[k3]);
+ k3 += 2;
+
+ INV_BOTJ_LANE(x_re.val[2], x_im.val[2], t_re.val[0], t_im.val[0], s_re_im.val[0]);
+ INV_BOTJ_LANE(x_re.val[3], x_im.val[3], t_re.val[1], t_im.val[1], s_re_im.val[0]);
+
+ INV_BOTJm_LANE(y_re.val[2], y_im.val[2], t_re.val[2], t_im.val[2], s_re_im.val[0]);
+ INV_BOTJm_LANE(y_re.val[3], y_im.val[3], t_re.val[3], t_im.val[3], s_re_im.val[0]);
+
+ if ((j >> 4) & 1) {
+ INV_TOPJmx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ } else {
+ INV_TOPJx4(t_re, t_im, x_re, x_im, y_re, y_im);
+ }
+
+ vload(s_re_im.val[0], &fpr_tab2[k2]);
+ k2 += 2 * ((j & 31) == 16);
+
+ if (last) {
+ vfmuln(s_re_im.val[0], s_re_im.val[0], fpr_p2_tab[logn]);
+ vfmulnx4(x_re, x_re, fpr_p2_tab[logn]);
+ vfmulnx4(x_im, x_im, fpr_p2_tab[logn]);
+ }
+ vstorex4(&f[j], x_re);
+ vstorex4(&f[j + hn], x_im);
+
+ if (logn == 5) {
+ // Special case in fpr_tab_log2 where re == im
+ vfmulx4_i(t_re, t_re, s_re_im.val[0]);
+ vfmulx4_i(t_im, t_im, s_re_im.val[0]);
+
+ vfaddx4(y_re, t_im, t_re);
+ vfsubx4(y_im, t_im, t_re);
+ } else {
+ if ((j >> 4) & 1) {
+ INV_BOTJm_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ } else {
+ INV_BOTJ_LANEx4(y_re, y_im, t_re, t_im, s_re_im.val[0]);
+ }
+ }
+
+ vstorex4(&f[j + 8], y_re);
+ vstorex4(&f[j + 8 + hn], y_im);
+ }
+}
+
+/*
+ * Vectorized 1 layer of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(fpr *f, const unsigned logn, const unsigned last) {
+ // Total SIMD register 26 = 24 + 2
+ float64x2x4_t a_re, a_im, b_re, b_im, t_re, t_im; // 24
+ float64x2_t s_re_im; // 2
+
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+
+ for (unsigned j = 0; j < ht; j += 8) {
+ vloadx4(a_re, &f[j]);
+ vloadx4(a_im, &f[j + hn]);
+ vloadx4(b_re, &f[j + ht]);
+ vloadx4(b_im, &f[j + ht + hn]);
+
+ INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im);
+
+ s_re_im = vld1q_dup_f64(&fpr_tab_log2[0]);
+
+ if (last) {
+ vfmuln(s_re_im, s_re_im, fpr_p2_tab[logn]);
+ vfmulnx4(a_re, a_re, fpr_p2_tab[logn]);
+ vfmulnx4(a_im, a_im, fpr_p2_tab[logn]);
+ }
+
+ vstorex4(&f[j], a_re);
+ vstorex4(&f[j + hn], a_im);
+
+ vfmulx4_i(t_re, t_re, s_re_im);
+ vfmulx4_i(t_im, t_im, s_re_im);
+
+ vfaddx4(b_re, t_im, t_re);
+ vfsubx4(b_im, t_im, t_re);
+
+ vstorex4(&f[j + ht], b_re);
+ vstorex4(&f[j + ht + hn], b_im);
+ }
+}
+
+/*
+ * Vectorized 2 layers of Inverse FFT for 16 complex points (32 coefficients).
+ */
+static void PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(fpr *f, const unsigned logn, const unsigned level, unsigned last) {
+ const unsigned int falcon_n = 1 << logn;
+ const unsigned int hn = falcon_n >> 1;
+
+ // Total SIMD register: 26 = 16 + 8 + 2
+ float64x2x4_t t_re, t_im; // 8
+ float64x2x2_t x1_re, x2_re, x1_im, x2_im,
+ y1_re, y2_re, y1_im, y2_im; // 16
+ float64x2_t s1_re_im, s2_re_im; // 2
+
+ const fpr *fpr_inv_tab1 = NULL, *fpr_inv_tab2 = NULL;
+ unsigned l, len, start, j, k1, k2;
+ unsigned bar = logn - 4;
+
+ for (l = 4; l < logn - level - 1; l += 2) {
+ len = 1 << l;
+ last -= 1;
+ fpr_inv_tab1 = fpr_table[bar--];
+ fpr_inv_tab2 = fpr_table[bar--];
+ k1 = 0;
+ k2 = 0;
+
+ for (start = 0; start < hn; start += 1U << (l + 2)) {
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJ(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJ(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJ_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJ_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+
+ start += 1U << (l + 2);
+ if (start >= hn) {
+ break;
+ }
+
+ vload(s1_re_im, &fpr_inv_tab1[k1]);
+ vload(s2_re_im, &fpr_inv_tab2[k2]);
+ k1 += 2;
+ k2 += 2U * ((start & 127) == 64);
+ if (!last) {
+ vfmuln(s2_re_im, s2_re_im, fpr_p2_tab[logn]);
+ }
+
+ for (j = start; j < start + len; j += 4) {
+
+ vloadx2(x1_re, &f[j]);
+ vloadx2(x1_im, &f[j + hn]);
+ vloadx2(y1_re, &f[j + len]);
+ vloadx2(y1_im, &f[j + len + hn]);
+
+ INV_TOPJ(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], y1_re.val[0], y1_im.val[0]);
+ INV_TOPJ(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], y1_re.val[1], y1_im.val[1]);
+
+ vloadx2(x2_re, &f[j + 2 * len]);
+ vloadx2(x2_im, &f[j + 2 * len + hn]);
+ vloadx2(y2_re, &f[j + 3 * len]);
+ vloadx2(y2_im, &f[j + 3 * len + hn]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], x2_re.val[0], x2_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], x2_re.val[1], x2_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJ_LANE(y1_re.val[0], y1_im.val[0], t_re.val[0], t_im.val[0], s1_re_im);
+ INV_BOTJ_LANE(y1_re.val[1], y1_im.val[1], t_re.val[1], t_im.val[1], s1_re_im);
+
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s1_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s1_re_im);
+
+ INV_TOPJm(t_re.val[0], t_im.val[0], x1_re.val[0], x1_im.val[0], x2_re.val[0], x2_im.val[0]);
+ INV_TOPJm(t_re.val[1], t_im.val[1], x1_re.val[1], x1_im.val[1], x2_re.val[1], x2_im.val[1]);
+
+ INV_TOPJm(t_re.val[2], t_im.val[2], y1_re.val[0], y1_im.val[0], y2_re.val[0], y2_im.val[0]);
+ INV_TOPJm(t_re.val[3], t_im.val[3], y1_re.val[1], y1_im.val[1], y2_re.val[1], y2_im.val[1]);
+
+ INV_BOTJm_LANE(x2_re.val[0], x2_im.val[0], t_re.val[0], t_im.val[0], s2_re_im);
+ INV_BOTJm_LANE(x2_re.val[1], x2_im.val[1], t_re.val[1], t_im.val[1], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[0], y2_im.val[0], t_re.val[2], t_im.val[2], s2_re_im);
+ INV_BOTJm_LANE(y2_re.val[1], y2_im.val[1], t_re.val[3], t_im.val[3], s2_re_im);
+
+ vstorex2(&f[j + 2 * len], x2_re);
+ vstorex2(&f[j + 2 * len + hn], x2_im);
+
+ vstorex2(&f[j + 3 * len], y2_re);
+ vstorex2(&f[j + 3 * len + hn], y2_im);
+
+ if (!last) {
+ vfmuln(x1_re.val[0], x1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_re.val[1], x1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[0], x1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(x1_im.val[1], x1_im.val[1], fpr_p2_tab[logn]);
+
+ vfmuln(y1_re.val[0], y1_re.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_re.val[1], y1_re.val[1], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[0], y1_im.val[0], fpr_p2_tab[logn]);
+ vfmuln(y1_im.val[1], y1_im.val[1], fpr_p2_tab[logn]);
+ }
+
+ vstorex2(&f[j], x1_re);
+ vstorex2(&f[j + hn], x1_im);
+
+ vstorex2(&f[j + len], y1_re);
+ vstorex2(&f[j + len + hn], y1_im);
+ }
+ }
+ }
+}
+
+/*
+ * Scalable vectorized Forward FFT implementation.
+ * Supports logn in [1, 10].
+ * Can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_FFT(fpr *f, const unsigned logn) {
+ unsigned level = logn;
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, 5);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(f, logn, level);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn1(f, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_logn2(f, logn, level - 1);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT_log5(f, logn);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Scalable vectorized Inverse FFT implementation.
+ * Supports logn in [1, 10].
+ * Can easily be extended to logn > 10.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_iFFT(fpr *f, const unsigned logn) {
+ const unsigned level = (logn - 5) & 1;
+
+ switch (logn) {
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log2(f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log3(f);
+ break;
+
+ case 4:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log4(f);
+ break;
+
+ case 5:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, 5, 1);
+ break;
+
+ case 6:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ case 7:
+ case 9:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(f, logn, level, 1);
+ break;
+
+ case 8:
+ case 10:
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_log5(f, logn, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn2(f, logn, level, 0);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT_logn1(f, logn, 1);
+ break;
+
+ default:
+ break;
+ }
+}
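A minimal round-trip sketch (illustrative, not part of the patch): for falcon-padded-512 the polynomial degree is n = 512, so logn = 9; running the forward and inverse dispatchers back to back should reproduce the input, since the inverse path applies the 2^(1-logn) scaling on its final stage.

void fft_roundtrip_demo(void) {
    fpr f[512], g[512];
    for (int i = 0; i < 512; i++) {
        f[i] = g[i] = (fpr)(i % 17) - 8.0;   /* arbitrary test data */
    }
    PQCLEAN_FALCONPADDED512_AARCH64_FFT(f, 9);
    PQCLEAN_FALCONPADDED512_AARCH64_iFFT(f, 9);
    /* f[i] now equals g[i] up to floating-point rounding */
}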
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft_tree.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft_tree.c
new file mode 100644
index 000000000..7ff6baca4
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fft_tree.c
@@ -0,0 +1,247 @@
+/*
+ * High-speed vectorized FFT tree for arbitrary `logn`.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/*
+ * 1 layer of Merge FFT for 2 complex points (4 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log2(fpr *f, const fpr *f0, const fpr *f1) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f0[0];
+ a_im = f0[1];
+ s = fpr_tab_log2[0];
+ b_re = f1[0] * s;
+ b_im = f1[1] * s;
+
+ d_re = b_re - b_im;
+ d_im = b_re + b_im;
+
+ f[0] = a_re + d_re;
+ f[2] = a_im + d_im;
+ f[1] = a_re - d_re;
+ f[3] = a_im - d_im;
+}
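Written with C99 complex numbers, the merge above is simply f[0] = f0 + w*f1 and f[1] = f0 - w*f1 with w = exp(i*pi/4), whose real and imaginary parts are both fpr_tab_log2[0]; the only difference is that the code stores the real parts in f[0..1] and the imaginary parts in f[2..3]. A scalar sketch (illustrative only, not part of the patch):

#include <complex.h>

static void merge_log2_ref(double complex f[2],
                           double complex f0, double complex f1) {
    const double s = 0.707106781186547524400844362;   /* fpr_tab_log2[0] */
    const double complex w = s + s * I;                /* exp(i*pi/4)     */
    f[0] = f0 + w * f1;
    f[1] = f0 - w * f1;
}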
+
+/*
+ * Vectorized 1 layer of Merge FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log3(fpr *f, const fpr *f0, const fpr *f1) {
+ // Total SIMD registers: 12 = 10 + 2
+ float64x2x2_t g1, g0, g_re, g_im, s_re_im; // 10
+ float64x2_t t_re, t_im; // 2
+
+ vloadx2(g1, &f1[0]);
+
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ FWD_TOP(t_re, t_im, g1.val[0], g1.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vloadx2(g0, &f0[0]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0.val[0], g0.val[1], t_re, t_im);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0.val[0], g0.val[1], t_re, t_im);
+
+ vstore2(&f[0], g_re);
+ vstore2(&f[4], g_im);
+}
+
+/*
+ * Vectorized 1 layer of Merge FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log4(fpr *f, const fpr *f0, const fpr *f1, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_merge = fpr_table[logn];
+
+ // Total SIMD register 22 = 14 + 8
+ float64x2x2_t g1_re, g1_im, g0_re, g0_im, s_re_im, t_re, t_im; // 14
+ float64x2x4_t g_re, g_im; // 8
+
+ for (unsigned j = 0; j < ht; j += 4) {
+ vload2(g1_re, &f1[j]);
+ vload2(g1_im, &f1[j + ht]);
+
+ vload2(s_re_im, &fpr_merge[j]);
+
+ FWD_TOP(t_re.val[0], t_im.val[0], g1_re.val[0], g1_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_re, &f0[j]);
+
+ FWD_TOP(t_re.val[1], t_im.val[1], g1_re.val[1], g1_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+ vload2(g0_im, &f0[j + ht]);
+
+ FPC_ADD(g_re.val[0], g_im.val[0], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_SUB(g_re.val[1], g_im.val[1], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
+ FPC_ADDJ(g_re.val[2], g_im.val[2], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+ FPC_SUBJ(g_re.val[3], g_im.val[3], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
+
+ vstore4(&f[j << 1], g_re);
+ vstore4(&f[(j + ht) << 1], g_im);
+ }
+}
+
+/*
+ * 1 layer of Split FFT for 2 complex points (4 coefficients).
+ */
+static void
+PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log2(fpr *restrict f0, fpr *restrict f1, const fpr *restrict f) {
+ fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
+ a_re = f[0];
+ b_re = f[1];
+ a_im = f[2];
+ b_im = f[3];
+ s = fpr_tab_log2[0] * 0.5;
+
+ f0[0] = (a_re + b_re) * 0.5;
+ f0[1] = (a_im + b_im) * 0.5;
+
+ d_re = (a_re - b_re) * s;
+ d_im = (a_im - b_im) * s;
+
+ f1[0] = d_im + d_re;
+ f1[1] = d_im - d_re;
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 4 complex points (8 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log3(fpr *f0, fpr *f1, const fpr *f) {
+ // Total SIMD registers: 12
+ float64x2x2_t re, im, g0, g1, s_re_im, tm; // 12
+
+ vload2(re, &f[0]);
+ vload2(im, &f[4]);
+
+ FPC_ADD(g0.val[0], g0.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ FPC_SUB(tm.val[0], tm.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
+ vload2(s_re_im, &fpr_tab_log3[0]);
+
+ vfmuln(g0.val[0], g0.val[0], 0.5);
+ vfmuln(g0.val[1], g0.val[1], 0.5);
+ vstorex2(&f0[0], g0);
+
+ vfmuln(s_re_im.val[0], s_re_im.val[0], 0.5);
+ vfmuln(s_re_im.val[1], s_re_im.val[1], 0.5);
+
+ INV_BOTJ(g1.val[0], g1.val[1], tm.val[0], tm.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstorex2(&f1[0], g1);
+}
+
+/*
+ * Vectorized 1 layer of Split FFT for 8 complex points (16 coefficients).
+ */
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log4(fpr *f0, fpr *f1, const fpr *f, const unsigned logn) {
+ const unsigned n = 1 << logn;
+ const unsigned hn = n >> 1;
+ const unsigned ht = n >> 2;
+ const fpr *fpr_split = fpr_table[logn];
+
+ // Total SIMD register 23 = 1 + 8 + 14
+ float64x2_t half; // 1
+ float64x2x4_t g_re, g_im; // 8
+ float64x2x2_t s_re_im, t_re, t_im, g1_re, g1_im, g0_re, g0_im; // 14
+
+ half = vdupq_n_f64(0.5);
+ for (unsigned j = 0; j < ht; j += 4) {
+ unsigned j2 = j << 1;
+ vload4(g_re, &f[j2]);
+ vload4(g_im, &f[j2 + hn]);
+
+ FPC_ADD(g0_re.val[0], g0_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_ADD(g0_re.val[1], g0_im.val[1], g_re.val[2], g_im.val[2], g_re.val[3], g_im.val[3]);
+
+ FPC_SUB(t_re.val[0], t_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
+ FPC_SUB(t_re.val[1], t_im.val[1], g_re.val[3], g_im.val[3], g_re.val[2], g_im.val[2]);
+
+ vload2(s_re_im, &fpr_split[j]);
+
+ vfmul(g0_re.val[0], g0_re.val[0], half);
+ vfmul(g0_re.val[1], g0_re.val[1], half);
+ vstore2(&f0[j], g0_re);
+
+ vfmul(g0_im.val[0], g0_im.val[0], half);
+ vfmul(g0_im.val[1], g0_im.val[1], half);
+ vstore2(&f0[j + ht], g0_im);
+
+ vfmul(s_re_im.val[0], s_re_im.val[0], half);
+ vfmul(s_re_im.val[1], s_re_im.val[1], half);
+
+ INV_BOTJ(g1_re.val[0], g1_im.val[0], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
+ INV_BOTJm(g1_re.val[1], g1_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);
+
+ vstore2(&f1[j], g1_re);
+ vstore2(&f1[j + ht], g1_im);
+ }
+}
+
+/*
+ * Vectorized Split FFT implementation
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1, const fpr *f, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; qn = 0;
+ f0[0] = f[0];
+ f1[0] = f[1];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log2(f0, f1, f);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log3(f0, f1, f);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_splitFFT_log4(f0, f1, f, logn);
+ break;
+ }
+}
+
+/*
+ * Vectorized Merge FFT implementation
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(fpr *restrict f, const fpr *restrict f0,
+ const fpr *restrict f1, const unsigned logn) {
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ f[0] = f0[0];
+ f[1] = f1[0];
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log2(f, f0, f1);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log3(f, f0, f1);
+ break;
+
+ default:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mergeFFT_log4(f, f0, f1, logn);
+ break;
+ }
+}
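Usage sketch (illustrative, not part of the patch): splitting an FFT-domain polynomial and merging the halves back should reproduce the original values; this is the split/merge pattern the fast Fourier sampling uses during signing.

void split_merge_demo(const fpr *f_fft, unsigned logn) {
    fpr f0[256], f1[256], g[512];   /* room for logn up to 9 */

    PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(f0, f1, f_fft, logn);
    PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(g, f0, f1, logn);
    /* g[0 .. (1 << logn) - 1] matches f_fft up to rounding */
}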
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.c
new file mode 100644
index 000000000..94e92a56c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.c
@@ -0,0 +1,204 @@
+/*
+ * Compressed floating-point Twiddle Factor.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+
+const fpr fpr_p2_tab[] = {
+ 2.00000000000,
+ 1.00000000000,
+ 0.50000000000,
+ 0.25000000000,
+ 0.12500000000,
+ 0.06250000000,
+ 0.03125000000,
+ 0.01562500000,
+ 0.00781250000,
+ 0.00390625000,
+ 0.00195312500
+};
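These entries are the powers 2^(1-logn) for logn = 0..10; the inverse FFT indexes the table with logn to pick up the 2/n normalization in its last layer. A quick check (illustrative only, not part of the patch):

#include <assert.h>
#include <math.h>

void check_p2_tab(void) {
    for (int logn = 0; logn <= 10; logn++) {
        /* fpr_p2_tab[logn] == 2^(1 - logn) == 2.0 / (1u << logn) */
        assert(fpr_p2_tab[logn] == ldexp(1.0, 1 - logn));
    }
}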
+
+const fpr fpr_tab_log2[] = {
+ 0.707106781186547524400844362, 0.707106781186547524400844362, // 4, 5
+};
+
+const fpr fpr_tab_log3[] = {
+ 0.923879532511286756128183189, 0.382683432365089771728459984, // 8, 9
+ -0.382683432365089771728459984, 0.923879532511286756128183189,
+};
+
+const fpr fpr_tab_log4[] = {
+ 0.980785280403230449126182236, 0.195090322016128267848284868, // 16
+ 0.555570233019602224742830814, 0.831469612302545237078788378, // 20
+};
+
+const fpr fpr_tab_log5[] = {
+ 0.995184726672196886244836953, 0.098017140329560601994195564, // 32
+ 0.634393284163645498215171613, 0.773010453362736960810906610, // 36
+ 0.881921264348355029712756864, 0.471396736825997648556387626, // 40
+ 0.290284677254462367636192376, 0.956940335732208864935797887, // 44
+};
+
+const fpr fpr_tab_log6[] = {
+ 0.998795456205172392714771605, 0.049067674327418014254954977, // 64
+ 0.671558954847018400625376850, 0.740951125354959091175616897, // 68
+ 0.903989293123443331586200297, 0.427555093430282094320966857, // 72
+ 0.336889853392220050689253213, 0.941544065183020778412509403, // 76
+ 0.970031253194543992603984207, 0.242980179903263889948274162, // 80
+ 0.514102744193221726593693839, 0.857728610000272069902269984, // 84
+ 0.803207531480644909806676513, 0.595699304492433343467036529, // 88
+ 0.146730474455361751658850130, 0.989176509964780973451673738, // 92
+};
+
+const fpr fpr_tab_log7[] = {
+ 0.999698818696204220115765650, 0.024541228522912288031734529, // 128
+ 0.689540544737066924616730630, 0.724247082951466920941069243, // 132
+ 0.914209755703530654635014829, 0.405241314004989870908481306, // 136
+ 0.359895036534988148775104572, 0.932992798834738887711660256, // 140
+ 0.975702130038528544460395766, 0.219101240156869797227737547, // 144
+ 0.534997619887097210663076905, 0.844853565249707073259571205, // 148
+ 0.817584813151583696504920884, 0.575808191417845300745972454, // 152
+ 0.170961888760301226363642357, 0.985277642388941244774018433, // 156
+ 0.992479534598709998156767252, 0.122410675199216198498704474, // 160
+ 0.615231590580626845484913563, 0.788346427626606262009164705, // 164
+ 0.870086991108711418652292404, 0.492898192229784036873026689, // 168
+ 0.266712757474898386325286515, 0.963776065795439866686464356, // 172
+ 0.949528180593036667195936074, 0.313681740398891476656478846, // 176
+ 0.449611329654606600046294579, 0.893224301195515320342416447, // 180
+ 0.757208846506484547575464054, 0.653172842953776764084203014, // 184
+ 0.073564563599667423529465622, 0.997290456678690216135597140, // 188
+};
+
+const fpr fpr_tab_log8[] = {
+ 0.999924701839144540921646491, 0.012271538285719926079408262, // 256
+ 0.698376249408972853554813503, 0.715730825283818654125532623, // 260
+ 0.919113851690057743908477789, 0.393992040061048108596188661, // 264
+ 0.371317193951837543411934967, 0.928506080473215565937167396, // 268
+ 0.978317370719627633106240097, 0.207111376192218549708116020, // 272
+ 0.545324988422046422313987347, 0.838224705554838043186996856, // 276
+ 0.824589302785025264474803737, 0.565731810783613197389765011, // 280
+ 0.183039887955140958516532578, 0.983105487431216327180301155, // 284
+ 0.993906970002356041546922813, 0.110222207293883058807899140, // 288
+ 0.624859488142386377084072816, 0.780737228572094478301588484, // 292
+ 0.876070094195406607095844268, 0.482183772079122748517344481, // 296
+ 0.278519689385053105207848526, 0.960430519415565811199035138, // 300
+ 0.953306040354193836916740383, 0.302005949319228067003463232, // 304
+ 0.460538710958240023633181487, 0.887639620402853947760181617, // 308
+ 0.765167265622458925888815999, 0.643831542889791465068086063, // 312
+ 0.085797312344439890461556332, 0.996312612182778012627226190, // 316
+ 0.998118112900149207125155861, 0.061320736302208577782614593, // 320
+ 0.662415777590171761113069817, 0.749136394523459325469203257, // 324
+ 0.898674465693953843041976744, 0.438616238538527637647025738, // 328
+ 0.325310292162262934135954708, 0.945607325380521325730945387, // 332
+ 0.966976471044852109087220226, 0.254865659604514571553980779, // 336
+ 0.503538383725717558691867071, 0.863972856121586737918147054, // 340
+ 0.795836904608883536262791915, 0.605511041404325513920626941, // 344
+ 0.134580708507126186316358409, 0.990902635427780025108237011, // 348
+ 0.987301418157858382399815802, 0.158858143333861441684385360, // 352
+ 0.585797857456438860328080838, 0.810457198252594791726703434, // 356
+ 0.851355193105265142261290312, 0.524589682678468906215098464, // 360
+ 0.231058108280671119643236018, 0.972939952205560145467720114, // 364
+ 0.937339011912574923201899593, 0.348418680249434568419308588, // 368
+ 0.416429560097637182562598911, 0.909167983090522376563884788, // 372
+ 0.732654271672412834615546649, 0.680600997795453050594430464, // 376
+ 0.036807222941358832324332691, 0.999322384588349500896221011, // 380
+};
+
+const fpr fpr_tab_log9[] = {
+ 0.999981175282601142656990438, 0.006135884649154475359640235, // 512
+ 0.702754744457225302452914421, 0.711432195745216441522130290, // 516
+ 0.921514039342041943465396332, 0.388345046698826291624993541, // 520
+ 0.377007410216418256726567823, 0.926210242138311341974793388, // 524
+ 0.979569765685440534439326110, 0.201104634842091911558443546, // 528
+ 0.550457972936604802977289893, 0.834862874986380056304401383, // 532
+ 0.828045045257755752067527592, 0.560661576197336023839710223, // 536
+ 0.189068664149806212754997837, 0.981963869109555264072848154, // 540
+ 0.994564570734255452119106243, 0.104121633872054579120943880, // 544
+ 0.629638238914927025372981341, 0.776888465673232450040827983, // 548
+ 0.879012226428633477831323711, 0.476799230063322133342158117, // 552
+ 0.284407537211271843618310615, 0.958703474895871555374645792, // 556
+ 0.955141168305770721498157712, 0.296150888243623824121786128, // 560
+ 0.465976495767966177902756065, 0.884797098430937780104007041, // 564
+ 0.769103337645579639346626069, 0.639124444863775743801488193, // 568
+ 0.091908956497132728624990979, 0.995767414467659793982495643, // 572
+ 0.998475580573294752208559038, 0.055195244349689939809447526, // 576
+ 0.666999922303637506650154222, 0.745057785441465962407907310, // 580
+ 0.901348847046022014570746093, 0.433093818853151968484222638, // 584
+ 0.331106305759876401737190737, 0.943593458161960361495301445, // 588
+ 0.968522094274417316221088329, 0.248927605745720168110682816, // 592
+ 0.508830142543107036931749324, 0.860866938637767279344583877, // 596
+ 0.799537269107905033500246232, 0.600616479383868926653875896, // 600
+ 0.140658239332849230714788846, 0.990058210262297105505906464, // 604
+ 0.988257567730749491404792538, 0.152797185258443427720336613, // 608
+ 0.590759701858874228423887908, 0.806847553543799272206514313, // 612
+ 0.854557988365400520767862276, 0.519355990165589587361829932, // 616
+ 0.237023605994367206867735915, 0.971503890986251775537099622, // 620
+ 0.939459223602189911962669246, 0.342660717311994397592781983, // 624
+ 0.422000270799799685941287941, 0.906595704514915365332960588, // 628
+ 0.736816568877369875090132520, 0.676092703575315960360419228, // 632
+ 0.042938256934940823077124540, 0.999077727752645382888781997, // 636
+ 0.999529417501093163079703322, 0.030674803176636625934021028, // 640
+ 0.685083667772700381362052545, 0.728464390448225196492035438, // 644
+ 0.911706032005429851404397325, 0.410843171057903942183466675, // 648
+ 0.354163525420490382357395796, 0.935183509938947577642207480, // 652
+ 0.974339382785575860518721668, 0.225083911359792835991642120, // 656
+ 0.529803624686294668216054671, 0.848120344803297251279133563, // 660
+ 0.814036329705948361654516690, 0.580813958095764545075595272, // 664
+ 0.164913120489969921418189113, 0.986308097244598647863297524, // 668
+ 0.991709753669099522860049931, 0.128498110793793172624415589, // 672
+ 0.610382806276309452716352152, 0.792106577300212351782342879, // 676
+ 0.867046245515692651480195629, 0.498227666972781852410983869, // 680
+ 0.260794117915275518280186509, 0.965394441697689374550843858, // 684
+ 0.947585591017741134653387321, 0.319502030816015677901518272, // 688
+ 0.444122144570429231642069418, 0.895966249756185155914560282, // 692
+ 0.753186799043612482483430486, 0.657806693297078656931182264, // 696
+ 0.067443919563664057897972422, 0.997723066644191609848546728, // 700
+ 0.996820299291165714972629398, 0.079682437971430121147120656, // 704
+ 0.648514401022112445084560551, 0.761202385484261814029709836, // 708
+ 0.890448723244757889952150560, 0.455083587126343823535869268, // 712
+ 0.307849640041534893682063646, 0.951435020969008369549175569, // 716
+ 0.962121404269041595429604316, 0.272621355449948984493347477, // 720
+ 0.487550160148435954641485027, 0.873094978418290098636085973, // 724
+ 0.784556597155575233023892575, 0.620057211763289178646268191, // 728
+ 0.116318630911904767252544319, 0.993211949234794533104601012, // 732
+ 0.984210092386929073193874387, 0.177004220412148756196839844, // 736
+ 0.570780745886967280232652864, 0.821102514991104679060430820, // 740
+ 0.841554977436898409603499520, 0.540171472729892881297845480, // 744
+ 0.213110319916091373967757518, 0.977028142657754351485866211, // 748
+ 0.930766961078983731944872340, 0.365612997804773870011745909, // 752
+ 0.399624199845646828544117031, 0.916679059921042663116457013, // 756
+ 0.720002507961381629076682999, 0.693971460889654009003734389, // 760
+ 0.018406729905804820927366313, 0.999830581795823422015722275, // 764
+};
+
+const fpr *fpr_table[] = {
+ NULL, NULL,
+ fpr_tab_log2,
+ fpr_tab_log3,
+ fpr_tab_log4,
+ fpr_tab_log5,
+ fpr_tab_log6,
+ fpr_tab_log7,
+ fpr_tab_log8,
+ fpr_tab_log9,
+};
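Each (re, im) pair in the tables above is the cosine/sine of an odd multiple of pi/2^level, listed in the order the corresponding FFT layer consumes them; fpr_table[] then lets the generic loops pick the right table from logn alone. A spot check of the first entries (illustrative only, not part of the patch):

#include <assert.h>
#include <math.h>

void check_twiddles(void) {
    const double pi = 3.14159265358979323846;
    assert(fabs(fpr_tab_log2[0] - cos(pi / 4.0)) < 1e-15);
    assert(fabs(fpr_tab_log3[0] - cos(pi / 8.0)) < 1e-15);
    assert(fabs(fpr_tab_log3[1] - sin(pi / 8.0)) < 1e-15);
    assert(fabs(fpr_tab_log4[0] - cos(pi / 16.0)) < 1e-15);
    assert(fabs(fpr_tab_log4[1] - sin(pi / 16.0)) < 1e-15);
}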
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.h
new file mode 100644
index 000000000..6a045a45e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/fpr.h
@@ -0,0 +1,245 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "macrof.h"
+/*
+ * In this implementation, 'fpr' is simply the native 'double' type; the
+ * FPR() and fpr_*() wrappers below keep the code shaped like the reference
+ * implementation. This should have no extra runtime cost, since all the
+ * functions below are 'inline'.
+ */
+typedef double fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return (double)i;
+}
+
+static const fpr fpr_q = 12289.0 ;
+static const fpr fpr_inverse_of_q = 1.0 / 12289.0 ;
+static const fpr fpr_inv_2sqrsigma0 = .150865048875372721532312163019 ;
+static const fpr fpr_inv_sigma_9 = 0.0060336696681577241031668062510953022 ;
+static const fpr fpr_sigma_min_9 = 1.2778336969128335860256340575729042 ;
+static const fpr fpr_log2 = 0.69314718055994530941723212146 ;
+static const fpr fpr_inv_log2 = 1.4426950408889634073599246810 ;
+static const fpr fpr_bnorm_max = 16822.4121 ;
+static const fpr fpr_zero = 0.0 ;
+static const fpr fpr_one = 1.0 ;
+static const fpr fpr_two = 2.0 ;
+static const fpr fpr_onehalf = 0.5 ;
+static const fpr fpr_invsqrt2 = 0.707106781186547524400844362105 ;
+static const fpr fpr_invsqrt8 = 0.353553390593273762200422181052 ;
+static const fpr fpr_ptwo31 = 2147483648.0 ;
+static const fpr fpr_ptwo31m1 = 2147483647.0 ;
+static const fpr fpr_mtwo31m1 = -2147483647.0 ;
+static const fpr fpr_ptwo63m1 = 9223372036854775807.0 ;
+static const fpr fpr_mtwo63m1 = -9223372036854775807.0 ;
+static const fpr fpr_ptwo63 = 9223372036854775808.0 ;
+
+static inline int64_t
+fpr_rint(fpr x) {
+ int64_t t;
+ __asm__ ( "fcvtns %x0, %d1": "=r" (t) : "w" (x));
+ return t;
+}
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x;
+ return r - (x < (double)r);
+}
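A worked example of the correction above (illustrative only, not part of the patch): truncation rounds toward zero, so negative non-integers come out one too high, and the (x < (double)r) comparison supplies the missing -1.

static int64_t floor_ref_demo(double x) {
    int64_t r = (int64_t)x;         /* -1.25 -> -1,  +1.25 -> +1 */
    return r - (x < (double)r);     /* -1.25 -> -2,  +1.25 -> +1 */
}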
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x;
+}
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return (x + y);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return (x - y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return (-x);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return (x * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return (x + x);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return (x * y);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return (x * x);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return (1.0 / x);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return (x / y);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ __asm__ ( "fsqrt %d0, %d0" : "+w" (x) : : );
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x < y;
+}
+
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ static const double C_expm[] = {
+ 1.000000000000000000000000000000, // c0
+ -0.999999999999994892974086724280, // c1
+ 0.500000000000019206858326015208, // c2
+ -0.166666666666984014666397229121, // c3
+ 0.041666666666110491190622155955, // c4
+ -0.008333333327800835146903501993, // c5
+ 0.001388888894063186997887560103, // c6
+ -0.000198412739277311890541063977, // c7
+ 0.000024801566833585381209939524, // c8
+ -0.000002755586350219122514855659, // c9
+ 0.000000275607356160477811864927, // c10
+ -0.000000025299506379442070029551, // c11
+ 0.000000002073772366009083061987, // c12
+ 0.000000000000000000000000000000,
+ };
+ float64x2_t neon_x, neon_1x, neon_x2,
+ neon_x4, neon_x8, neon_x12, neon_ccs;
+ float64x2x4_t neon_exp0;
+ float64x2x3_t neon_exp1;
+ float64x2_t y1, y2, y3, y;
+ double ret;
+
+ neon_exp0 = vld1q_f64_x4(&C_expm[0]);
+ neon_exp1 = vld1q_f64_x3(&C_expm[8]);
+ neon_ccs = vdupq_n_f64(ccs);
+ neon_ccs = vmulq_n_f64(neon_ccs, fpr_ptwo63);
+
+ // x | x
+ neon_x = vdupq_n_f64(x);
+ // 1 | x
+ neon_1x = vsetq_lane_f64(1.0, neon_x, 0);
+ neon_x2 = vmulq_f64(neon_x, neon_x);
+ neon_x4 = vmulq_f64(neon_x2, neon_x2);
+ neon_x8 = vmulq_f64(neon_x4, neon_x4);
+ neon_x12 = vmulq_f64(neon_x8, neon_x4);
+
+ vfmla(y1, neon_exp0.val[0], neon_exp0.val[1], neon_x2);
+ vfmla(y2, neon_exp0.val[2], neon_exp0.val[3], neon_x2);
+ vfmla(y3, neon_exp1.val[0], neon_exp1.val[1], neon_x2);
+
+ y1 = vmulq_f64(y1, neon_1x);
+ y2 = vmulq_f64(y2, neon_1x);
+ y3 = vmulq_f64(y3, neon_1x);
+
+ vfmla(y, y1, y2, neon_x4);
+ vfmla(y, y, y3, neon_x8);
+ vfmla(y, y, neon_exp1.val[2], neon_x12);
+ y = vmulq_f64( y, neon_ccs);
+ ret = vaddvq_f64(y);
+
+ return (uint64_t) ret;
+}
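Scalar reference for the vectorized polynomial above (illustrative only, not part of the patch): for 0 <= x < log(2) and ccs < 1, the result is approximately 2^63 * ccs * exp(-x), which is what the degree-12 polynomial in C_expm approximates before the final horizontal add.

#include <math.h>

static uint64_t expm_p63_ref(double x, double ccs) {
    /* assumes ccs * exp(-x) < 1 so the cast to uint64_t is well defined */
    return (uint64_t)(exp(-x) * ccs * 9223372036854775808.0);
}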
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED512_AARCH64_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+#define fpr_tab_log2 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log2
+#define fpr_tab_log3 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log3
+#define fpr_tab_log4 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log4
+#define fpr_tab_log5 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log5
+#define fpr_tab_log6 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log6
+#define fpr_tab_log7 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log7
+#define fpr_tab_log8 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log8
+#define fpr_tab_log9 PQCLEAN_FALCONPADDED512_AARCH64_fpr_tab_log9
+#define fpr_table PQCLEAN_FALCONPADDED512_AARCH64_fpr_table
+
+extern const fpr fpr_tab_log2[];
+extern const fpr fpr_tab_log3[];
+extern const fpr fpr_tab_log4[];
+extern const fpr fpr_tab_log5[];
+extern const fpr fpr_tab_log6[];
+extern const fpr fpr_tab_log7[];
+extern const fpr fpr_tab_log8[];
+extern const fpr fpr_tab_log9[];
+extern const fpr *fpr_table[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/inner.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/inner.h
new file mode 100644
index 000000000..65b0e7799
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/inner.h
@@ -0,0 +1,825 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+#include "params.h"
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED512_AARCH64_ macro (e.g. PQCLEAN_FALCONPADDED512_AARCH64_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED512_AARCH64_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(void *out, size_t max_out_len, const int8_t *x, uint8_t bits);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_comp_encode(void *out, size_t max_out_len, const int16_t *x);
+
+size_t PQCLEAN_FALCONPADDED512_AARCH64_modq_decode(uint16_t *x, const void *in,
+ size_t max_in_len, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(int8_t *x, unsigned bits, const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AARCH64_comp_decode(int16_t *x, const void *in, size_t max_in_len);
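Usage sketch (illustrative only, not part of the patch): as documented above, passing out == NULL makes an encoder return the exact output length, so a caller can size its buffer first and then encode for real.

size_t encode_demo(uint8_t *out, size_t out_cap, const int8_t *f, uint8_t bits) {
    size_t need;

    need = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(NULL, 0, f, bits);
    if (need == 0 || need > out_cap) {
        return 0;   /* a coefficient was out of range, or the buffer is too small */
    }
    return PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(out, out_cap, f, bits);
}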
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AARCH64_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
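Usage sketch (illustrative only, not part of the patch): hashing a nonce and a message into a point with the SHAKE256 wrappers defined earlier in this header; sizes assume logn = 9 (n = 512) and the 40-byte nonce used by Falcon signatures.

void hash_to_point_demo(const uint8_t *nonce, const uint8_t *msg, size_t msg_len) {
    inner_shake256_context sc;
    uint16_t hm[512];
    uint16_t tmp[512];   /* 2 * 2^logn bytes, 16-bit aligned */

    inner_shake256_init(&sc);
    inner_shake256_inject(&sc, nonce, 40);
    inner_shake256_inject(&sc, msg, msg_len);
    inner_shake256_flip(&sc);
    PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(&sc, hm, 9, (uint8_t *)tmp);
    inner_shake256_ctx_release(&sc);
    /* hm[] now holds the hashed point, usable as c0 in verification */
}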
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short(const int16_t *s1, const int16_t *s2);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(int16_t *s1tmp, int16_t *s2tmp,
+ const int16_t *hm, const double *t0,
+ const double *t1);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+/*
+ * Convert a public key to NTT. Conversion is done in place.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt(int16_t *h);
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt_monty(int16_t *h);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_compute_public(int16_t *h, const int8_t *f,
+ const int8_t *g, int16_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ * return exp(x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] 1/sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+ }
+ return v;
+}
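Usage sketch (illustrative only, not part of the patch): seed the ChaCha20-based PRNG from a flipped SHAKE256 context, then draw values with the inline accessors above (the 48-byte zero seed here is a placeholder).

void prng_demo(void) {
    inner_shake256_context sc;
    prng p;
    uint8_t seed[48] = {0};   /* placeholder; use real entropy in practice */
    uint64_t r;

    inner_shake256_init(&sc);
    inner_shake256_inject(&sc, seed, sizeof seed);
    inner_shake256_flip(&sc);

    PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&p, &sc);
    r = prng_get_u64(&p);
    (void)r;

    inner_shake256_ctx_release(&sc);
}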
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(fpr *c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(fpr *c, const fpr *a, const fpr *restrict b, const fpr *restrict d, unsigned logn);
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(fpr *d, fpr *a, const fpr *restrict b, unsigned logn);
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d,
+ const fpr *a, const fpr *restrict b, unsigned logn);
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(fpr *c, const fpr *restrict a, unsigned logn);
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(fpr *c, const fpr *restrict d, const fpr *restrict a, unsigned logn);
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_fft(fpr *restrict c, const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add_muladj_fft(fpr *restrict d,
+ const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a, const fpr *restrict b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01, fpr *restrict g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(fpr *restrict d11, fpr *restrict l10,
+ const fpr *restrict g00, const fpr *restrict g01,
+ const fpr *restrict g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1,
+ const fpr *restrict f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(fpr *restrict f,
+ const fpr *restrict f0, const fpr *restrict f1, unsigned logn);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm, const unsigned falcon_n);
+
+fpr PQCLEAN_FALCONPADDED512_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2);
+
+int32_t PQCLEAN_FALCONPADDED512_AARCH64_poly_small_sqnorm(const int8_t *f); // common.c
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
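[Editorial note: as a quick check of the 28*2^logn formula above, logn = 9 gives 28 * 512 = 14336 bytes (FALCON_KEYGEN_TEMP_9) and logn = 10 gives 28 * 1024 = 28672 bytes (FALCON_KEYGEN_TEMP_10), while the logn = 1 and logn = 2 values (136 and 272) exceed the nominal 56 and 112 bytes, as the comment notes.]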
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
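[Editorial note, not part of the patch: a hypothetical calling sequence for Falcon-512 (logn = 9) might look as follows. It assumes the inner_shake256_* wrappers declared in inner.h and a seed[] obtained from a cryptographically secure RNG; the union is one way to satisfy the 64-bit alignment requirement on tmp[].]

/* Editorial sketch, not part of the upstream sources. */
void keygen_demo(const uint8_t seed[48]) {
    int8_t f[512], g[512], F[512], G[512];
    uint16_t h[512];
    union {                              /* aligned for uint32_t, uint64_t and fpr */
        uint8_t b[FALCON_KEYGEN_TEMP_9];
        uint64_t align_u64;
        fpr align_fpr;
    } tmp;
    inner_shake256_context rng;

    inner_shake256_init(&rng);
    inner_shake256_inject(&rng, seed, 48);
    inner_shake256_flip(&rng);           /* context must be seeded and flipped */
    PQCLEAN_FALCONPADDED512_AARCH64_keygen(&rng, f, g, F, G, h, 9, tmp.b);
}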
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp);
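[Editorial note: to make the trade-off concrete at logn = 9, sign_tree() works from a (8*9+40)*512 = 57344-byte expanded key plus a 48*512 = 24576-byte tmp, whereas sign_dyn() needs only the raw f, g, F, G and a single 72*512 = 36864-byte tmp, at the cost of recomputing the B0 matrix and LDL tree for every signature.]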
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p);
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/keygen.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/keygen.c
new file mode 100644
index 000000000..feee9d483
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/keygen.c
@@ -0,0 +1,4200 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "util.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
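[Editorial note: this is a standard Newton (Hensel) inversion. y = 2 - p is already an inverse of p modulo 2^2, and each step y <- y*(2 - p*y) doubles the number of correct low bits, so four steps give 1/p modulo 2^32; masking to 31 bits and negating then yields -1/p mod 2^31, as the comment above states.]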
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
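[Editorial note, not part of the patch: the conversion convention described in the comment block near the top of this file can be made concrete with two small helpers. Entering the Montgomery domain is a Montgomery multiplication by R2 = 2^62 mod p (as computed by modp_R2() above); leaving it is a Montgomery multiplication by 1.]

/* Editorial sketch, not part of the upstream sources. */
static inline uint32_t to_monty(uint32_t x, uint32_t p, uint32_t p0i, uint32_t R2) {
    return modp_montymul(x, R2, p, p0i);   /* (x * R^2) / R = x*R mod p */
}
static inline uint32_t from_monty(uint32_t xm, uint32_t p, uint32_t p0i) {
    return modp_montymul(xm, 1, p, p0i);   /* (x*R * 1) / R = x mod p */
}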
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also remove one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *restrict gm, uint32_t *restrict igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
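[Editorial note: for example, under this convention the value 2^40 + 5 occupies two words, x[0] = 5 and x[1] = 2^9 = 512, since each uint32_t word carries 31 payload bits.]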
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The subtraction is actually performed only if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *restrict a, const uint32_t *restrict b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m by a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
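[Editorial note: the "multiply x by 2^31" step above is implemented as modp_montymul(x, R2, p, p0i). Since R2 = 2^62 mod p, the Montgomery product is x*2^62/2^31 = x*2^31 mod p, which is exactly the shift by one 31-bit word that the word-by-word injection needs.]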
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *restrict x,
+ const uint32_t *restrict y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *restrict x, const uint32_t *restrict p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *restrict xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *restrict tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
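[Editorial note, toy illustration of the per-prime update with hypothetical numbers (not an actual prime pair from PRIMES[]): take q = 5 as the product of previous primes, p = 7 as the new prime, and s = 1/5 mod 7 = 3. For the true value 23, x mod q = 3 and xp = 23 mod 7 = 2, so the update gives 3 + 5*((3*(2-3)) mod 7) = 3 + 5*4 = 23, as expected.]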
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have their 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *restrict u, uint32_t *restrict v,
+ const uint32_t *restrict x, const uint32_t *restrict y,
+ size_t len, uint32_t *restrict tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remainder of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
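+ * Toy illustration of the bit-by-bit algorithm (with small,
+ * non-representative values): for x = 7 and y = 5 we start with
+ * a = 7, b = 5, u0 = 1, v0 = 0, u1 = 5, v1 = 6, and get:
+ *
+ * a,b odd, a > b: a = 1, u0 = 3, v0 = 4 (7*3 - 5*4 = 1)
+ * a,b odd, b > a: b = 2, u1 = 1, v1 = 1 (7*1 - 5*1 = 2)
+ * b even: b = 1, u1 = 3, v1 = 4
+ * a = b: b = 0
+ *
+ * so the GCD is a = 1 and x*u0 - y*v0 = 7*3 - 5*4 = 1.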
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each iteration reduces the total length
+ * (sum of the lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, so a division by 2 simply amounts to
+ * not multiplying by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
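+ * (e.g. sc = 40 gives sch = 1 and scl = 9: y is shifted by one
+ * full 31-bit word plus 9 bits within each word)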
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *restrict x, size_t xlen,
+ const uint32_t *restrict y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ wy = v < ylen ? y[v] : ysign;
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
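+ /*
+ * Bit 30 is the sign bit of the 31-bit limb; copying it into
+ * bit 31 turns w into the correct 32-bit two's complement value.
+ */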
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of
+ * each, but to an upper word.
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
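+ *
+ * The index wrap at u + v == n - 1 in the loop below, together
+ * with the sign flip on kf, implements the reduction modulo
+ * X^N+1 (since X^N = -1).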
+ */
+static void
+poly_sub_scaled(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *restrict F, size_t Flen, size_t Fstride,
+ const uint32_t *restrict f, size_t flen, size_t fstride,
+ const int32_t *restrict k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *restrict tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * The table below encodes a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), we can just generate more values and add them
+ * together for lower dimensions.
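+ * For instance, for logn = 9 (N = 512) we add g = 2 samples, and
+ * their sum has standard deviation sigma*sqrt(2) =
+ * 1.17*sqrt(q/(2*512)), which is the deviation required for that
+ * degree.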
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
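+ *
+ * For instance, MAX_BL_SMALL[10] = 209 words, i.e. 209*31 = 6479
+ * bits, which sits about seven standard deviations above the
+ * measured depth-10 average of 6307.52 bits for fg.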
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
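+ /*
+ * If any partial sum reached 2^31, then ng has its top bit set,
+ * -(ng >> 31) is all-ones and the result saturates to 2^32-1;
+ * otherwise the mask is zero and s is returned unchanged.
+ */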
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
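+ *
+ * In effect, each pair of consecutive NTT coefficients corresponds
+ * to a pair of roots (w, -w); their product is f(w)*f(-w) =
+ * (f*adj(f))(w) = f'(w^2), i.e. the value of the half-degree
+ * polynomial at the squared root (see the equations detailed in
+ * solve_NTRU_binary_depth1()).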
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ for (d = 0; d < depth; d ++) {
+ make_fg_step(data, logn - d, d,
+ d != 0, (d + 1) < depth || out_ntt);
+ }
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply a failure to solve the NTRU equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
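+ * Since zint_bezout() guarantees fp*Gp - gp*Fp = 1, the
+ * multiplication by q turns this into fp*Gp - gp*Fp = q, i.e.
+ * the NTRU equation at the deepest level.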
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficient will remain low. If it goes out of range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ rlen = (slen > 10) ? 10 : slen;
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt3, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt4, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ rlen = (FGlen > 10) ? 10 : FGlen;
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_add(rt2, rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -2^31-1..+2^31-1.
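+ *
+ * (The true value is the rt2 content times 2^(scale_FG-scale_fg);
+ * dividing by 2^(scale_k) gives the integer k we want, hence the
+ * multiplier 2^(scale_FG - scale_fg - scale_k) = 2^(-dc) applied
+ * below.)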
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
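+ /*
+ * Square-and-multiply on the bits of dc: for instance, dc = 5
+ * (binary 101) with pt = 1/2 yields pdc = (1/2) * (1/2)^4 = 2^(-5).
+ */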
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the first n elements
+ * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
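+
+ /*
+ * Concretely, the pass computes k = round((F*adj(f) + G*adj(g)) /
+ * (f*adj(f) + g*adj(g))) (the division is done in FFT representation,
+ * the rounding on the resulting coefficients), then replaces (F, G)
+ * with (F - k*f, G - k*g).
+ */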
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt5, rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt3, rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(rt4, rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(rt2, rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
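+
+ /*
+ * In other words, writing f(X) = f0(X^2) + X*f1(X^2) (and similarly
+ * for g), f' is the field norm: f'(X^2) = f(X)*f(-X), and
+ * F(X) = F'(X^2)*g(-X), G(X) = G'(X^2)*f(-X), so that
+ * f*G - g*F = f'(X^2)*G'(X^2) - g'(X^2)*F'(X^2) = q.
+ */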
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
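+ /*
+ * As in the depth-1 case, the two NTT roots of each pair are stored
+ * consecutively (w at an even index, -w at the following odd index),
+ * and g(-w) (resp. f(-w)) is obtained by swapping the pair; hence the
+ * A/B swap in the products below.
+ */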
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_autoadj_fft(rt3, rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
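+ /*
+ * Each modp_montymul() product carries an extra R^-1 Montgomery
+ * factor; applying modp_montymul() to the constant q = 12289 as well
+ * puts both sides of the comparison below on the same scale.
+ */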
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be 1; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
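+ /*
+ * (Modulo 2, X^N+1 = (X+1)^N, so the resultant is congruent to
+ * f(1)^N = f(1), i.e. to the sum of the coefficients.)
+ */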
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ int16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+
+ poly_small_to_fp(rt1, f, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt1, rt1, logn);
+
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(rt2, rt2, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(rt1, rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt1, rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt1, logn);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(rt2, rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(rt2, rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(rt2, logn);
+
+ bnorm = PQCLEAN_FALCONPADDED512_AARCH64_compute_bnorm(rt1, rt2);
+
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (int16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = (int16_t *)h;
+ tmp2 = (int16_t *)tmp;
+ }
+
+ if (!PQCLEAN_FALCONPADDED512_AARCH64_compute_public(h2, f, g, tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrof.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrof.h
new file mode 100644
index 000000000..c8f82991e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrof.h
@@ -0,0 +1,125 @@
+/*
+ * 64-bit Floating point NEON macro x1
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+// c <= addr x1
+#define vload(c, addr) c = vld1q_f64(addr);
+// c <= addr interleave 2
+#define vload2(c, addr) c = vld2q_f64(addr);
+// c <= addr interleave 4
+#define vload4(c, addr) c = vld4q_f64(addr);
+
+#define vstore(addr, c) vst1q_f64(addr, c);
+// addr <= c
+#define vstore2(addr, c) vst2q_f64(addr, c);
+// addr <= c
+#define vstore4(addr, c) vst4q_f64(addr, c);
+
+// c <= addr x2
+#define vloadx2(c, addr) c = vld1q_f64_x2(addr);
+// c <= addr x3
+#define vloadx3(c, addr) c = vld1q_f64_x3(addr);
+
+// addr <= c
+#define vstorex2(addr, c) vst1q_f64_x2(addr, c);
+
+// c = a - b
+#define vfsub(c, a, b) c = vsubq_f64(a, b);
+
+// c = a + b
+#define vfadd(c, a, b) c = vaddq_f64(a, b);
+
+// c = a * b
+#define vfmul(c, a, b) c = vmulq_f64(a, b);
+
+// c = a * n (n is constant)
+#define vfmuln(c, a, n) c = vmulq_n_f64(a, n);
+
+// Swap from a|b to b|a
+#define vswap(c, a) c = vextq_f64(a, a, 1);
+
+// c = a * b[i]
+#define vfmul_lane(c, a, b, i) c = vmulq_laneq_f64(a, b, i);
+
+// c = 1/a
+#define vfinv(c, a) c = vdivq_f64(vdupq_n_f64(1.0), a);
+
+// c = -a
+#define vfneg(c, a) c = vnegq_f64(a);
+
+#define transpose_f64(a, b, t, ia, ib, it) \
+ t.val[it] = a.val[ia]; \
+ a.val[ia] = vzip1q_f64(a.val[ia], b.val[ib]); \
+ b.val[ib] = vzip2q_f64(t.val[it], b.val[ib]);
+
+/*
+ * c = a + jb
+ * c[0] = a[0] - b[1]
+ * c[1] = a[1] + b[0]
+ */
+#define vfcaddj(c, a, b) c = vcaddq_rot90_f64(a, b);
+
+/*
+ * c = a - jb
+ * c[0] = a[0] + b[1]
+ * c[1] = a[1] - b[0]
+ */
+#define vfcsubj(c, a, b) c = vcaddq_rot270_f64(a, b);
+
+// c[0] = c[0] + b[0]*a[0], c[1] = c[1] + b[1]*a[0]
+#define vfcmla(c, a, b) c = vcmlaq_f64(c, a, b);
+
+// c[0] = c[0] - b[1]*a[1], c[1] = c[1] + b[0]*a[1]
+#define vfcmla_90(c, a, b) c = vcmlaq_rot90_f64(c, a, b);
+
+// c[0] = c[0] - b[0]*a[0], c[1] = c[1] - b[1]*a[0]
+#define vfcmla_180(c, a, b) c = vcmlaq_rot180_f64(c, a, b);
+
+// c[0] = c[0] + b[1]*a[1], c[1] = c[1] - b[0]*a[1]
+#define vfcmla_270(c, a, b) c = vcmlaq_rot270_f64(c, a, b);
+
+/*
+ * Complex MUL: c = a*b
+ * c[0] = a[0]*b[0] - a[1]*b[1]
+ * c[1] = a[0]*b[1] + a[1]*b[0]
+ */
+#define FPC_CMUL(c, a, b) \
+ c = vmulq_laneq_f64(b, a, 0); \
+ c = vcmlaq_rot90_f64(c, a, b);
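+
+/*
+ * Per 128-bit lane holding (re, im): the lane-0 multiply yields
+ * (a_re*b_re, a_re*b_im), and vcmlaq_rot90 then adds
+ * (-a_im*b_im, +a_im*b_re), giving the full complex product.
+ */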
+
+/*
+ * Complex MUL: c = a * conjugate(b) = a * (b[0], -b[1])
+ * c[0] = b[0]*a[0] + b[1]*a[1]
+ * c[1] = + b[0]*a[1] - b[1]*a[0]
+ */
+#define FPC_CMUL_CONJ(c, a, b) \
+ c = vmulq_laneq_f64(a, b, 0); \
+ c = vcmlaq_rot270_f64(c, b, a);
+
+// d = c + a *b
+#define vfmla(d, c, a, b) d = vfmaq_f64(c, a, b);
+// d = c - a * b
+#define vfmls(d, c, a, b) d = vfmsq_f64(c, a, b);
+// d = c + a * b[i]
+#define vfmla_lane(d, c, a, b, i) d = vfmaq_laneq_f64(c, a, b, i);
+// d = c - a * b[i]
+#define vfmls_lane(d, c, a, b, i) d = vfmsq_laneq_f64(c, a, b, i);
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrofx4.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrofx4.h
new file mode 100644
index 000000000..e6b70e64e
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrofx4.h
@@ -0,0 +1,430 @@
+/*
+ * 64-bit Floating point NEON macro x4
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+#include "macrof.h"
+
+#define vloadx4(c, addr) c = vld1q_f64_x4(addr);
+
+#define vstorex4(addr, c) vst1q_f64_x4(addr, c);
+
+#define vfdupx4(c, constant) \
+ c.val[0] = vdupq_n_f64(constant); \
+ c.val[1] = vdupq_n_f64(constant); \
+ c.val[2] = vdupq_n_f64(constant); \
+ c.val[3] = vdupq_n_f64(constant);
+
+#define vfnegx4(c, a) \
+ c.val[0] = vnegq_f64(a.val[0]); \
+ c.val[1] = vnegq_f64(a.val[1]); \
+ c.val[2] = vnegq_f64(a.val[2]); \
+ c.val[3] = vnegq_f64(a.val[3]);
+
+#define vfmulnx4(c, a, n) \
+ c.val[0] = vmulq_n_f64(a.val[0], n); \
+ c.val[1] = vmulq_n_f64(a.val[1], n); \
+ c.val[2] = vmulq_n_f64(a.val[2], n); \
+ c.val[3] = vmulq_n_f64(a.val[3], n);
+
+// c = a - b
+#define vfsubx4(c, a, b) \
+ c.val[0] = vsubq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_f64(a.val[3], b.val[3]);
+
+// c = a + b
+#define vfaddx4(c, a, b) \
+ c.val[0] = vaddq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b.val[0]); \
+ c.val[1] = vmulq_f64(a.val[1], b.val[1]); \
+ c.val[2] = vmulq_f64(a.val[2], b.val[2]); \
+ c.val[3] = vmulq_f64(a.val[3], b.val[3]);
+
+#define vfmulx4_i(c, a, b) \
+ c.val[0] = vmulq_f64(a.val[0], b); \
+ c.val[1] = vmulq_f64(a.val[1], b); \
+ c.val[2] = vmulq_f64(a.val[2], b); \
+ c.val[3] = vmulq_f64(a.val[3], b);
+
+#define vfinvx4(c, a) \
+ c.val[0] = vdivq_f64(vdupq_n_f64(1.0), a.val[0]); \
+ c.val[1] = vdivq_f64(vdupq_n_f64(1.0), a.val[1]); \
+ c.val[2] = vdivq_f64(vdupq_n_f64(1.0), a.val[2]); \
+ c.val[3] = vdivq_f64(vdupq_n_f64(1.0), a.val[3]);
+
+#define vfcvtx4(c, a) \
+ c.val[0] = vcvtq_f64_s64(a.val[0]); \
+ c.val[1] = vcvtq_f64_s64(a.val[1]); \
+ c.val[2] = vcvtq_f64_s64(a.val[2]); \
+ c.val[3] = vcvtq_f64_s64(a.val[3]);
+
+#define vfmlax4(d, c, a, b) \
+ vfmla(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmla(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmla(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmla(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfmlsx4(d, c, a, b) \
+ vfmls(d.val[0], c.val[0], a.val[0], b.val[0]); \
+ vfmls(d.val[1], c.val[1], a.val[1], b.val[1]); \
+ vfmls(d.val[2], c.val[2], a.val[2], b.val[2]); \
+ vfmls(d.val[3], c.val[3], a.val[3], b.val[3]);
+
+#define vfrintx4(c, a) \
+ c.val[0] = vcvtnq_s64_f64(a.val[0]); \
+ c.val[1] = vcvtnq_s64_f64(a.val[1]); \
+ c.val[2] = vcvtnq_s64_f64(a.val[2]); \
+ c.val[3] = vcvtnq_s64_f64(a.val[3]);
+
+/*
+ * Wrapper for FFT, split/merge and poly_float.c
+ */
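+
+/*
+ * The x2/x4 macro variants below are straightforward unrollings of the
+ * single-vector forms over the .val[0..1] / .val[0..3] members of the
+ * float64x2x2_t / float64x2x4_t types, so callers can work on the wider
+ * vloadx2/vloadx4 loads without extra register shuffling.
+ */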
+
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmul(d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MULx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MULx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmul(d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmul(d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmul(d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmul(d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmul(d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmul(d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmul(d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
+#define FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmls(d_re, d_re, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_im); \
+ vfmla(d_im, d_im, a_im, b_re);
+
+#define FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]);
+
+#define FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmls(d_re.val[0], d_re.val[0], a_im.val[0], b_im.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmls(d_re.val[1], d_re.val[1], a_im.val[1], b_im.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmls(d_re.val[2], d_re.val[2], a_im.val[2], b_im.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmls(d_re.val[3], d_re.val[3], a_im.val[3], b_im.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[0], d_im.val[0], a_im.val[0], b_re.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[1], d_im.val[1], a_im.val[1], b_re.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[2], d_im.val[2], a_im.val[2], b_re.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]); \
+ vfmla(d_im.val[3], d_im.val[3], a_im.val[3], b_re.val[3]);
+
+#define FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmul(d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmul(d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmul(d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmul(d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmul(d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmul(d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmul(d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmla(d_re.val[0], d_re.val[0], b_im.val[0], a_im.val[0]); \
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], b_re.val[0]); \
+ vfmla(d_re.val[1], d_re.val[1], b_im.val[1], a_im.val[1]); \
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], b_re.val[1]); \
+ vfmla(d_re.val[2], d_re.val[2], b_im.val[2], a_im.val[2]); \
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], b_re.val[2]); \
+ vfmla(d_re.val[3], d_re.val[3], b_im.val[3], a_im.val[3]); \
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], b_re.val[3]); \
+ vfmla(d_im.val[0], d_im.val[0], b_re.val[0], a_im.val[0]); \
+ vfmls(d_im.val[0], d_im.val[0], a_re.val[0], b_im.val[0]); \
+ vfmla(d_im.val[1], d_im.val[1], b_re.val[1], a_im.val[1]); \
+ vfmls(d_im.val[1], d_im.val[1], a_re.val[1], b_im.val[1]); \
+ vfmla(d_im.val[2], d_im.val[2], b_re.val[2], a_im.val[2]); \
+ vfmls(d_im.val[2], d_im.val[2], a_re.val[2], b_im.val[2]); \
+ vfmla(d_im.val[3], d_im.val[3], b_re.val[3], a_im.val[3]); \
+ vfmls(d_im.val[3], d_im.val[3], a_re.val[3], b_im.val[3]);
+
+#define FPC_MUL_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_re, b_re_im, 1); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 0);
+
+#define FPC_MUL_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 1); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 0);
+
+#define FWD_TOP(t_re, t_im, b_re, b_im, zeta_re, zeta_im) \
+ FPC_MUL(t_re, t_im, b_re, b_im, zeta_re, zeta_im);
+
+#define FWD_TOP_LANE(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANE(t_re, t_im, b_re, b_im, zeta);
+
+#define FWD_TOP_LANEx4(t_re, t_im, b_re, b_im, zeta) \
+ FPC_MUL_LANEx4(t_re, t_im, b_re, b_im, zeta);
+
+/*
+ * FPC
+ */
+
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_re); \
+ d_im = vsubq_f64(a_im, b_im);
+
+#define FPC_SUBx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_im.val[3]);
+
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_re); \
+ d_im = vaddq_f64(a_im, b_im);
+
+#define FPC_ADDx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_re.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_im.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_re.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_im.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_re.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_im.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_re.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_im.val[3]);
+
+#define FWD_BOT(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUB(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+/*
+ * FPC_J
+ */
+
+#define FPC_ADDJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vsubq_f64(a_re, b_im); \
+ d_im = vaddq_f64(a_im, b_re);
+
+#define FPC_ADDJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vsubq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vaddq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vsubq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vaddq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vsubq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vaddq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vsubq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vaddq_f64(a_im.val[3], b_re.val[3]);
+
+#define FPC_SUBJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re = vaddq_f64(a_re, b_im); \
+ d_im = vsubq_f64(a_im, b_re);
+
+#define FPC_SUBJx4(d_re, d_im, a_re, a_im, b_re, b_im) \
+ d_re.val[0] = vaddq_f64(a_re.val[0], b_im.val[0]); \
+ d_im.val[0] = vsubq_f64(a_im.val[0], b_re.val[0]); \
+ d_re.val[1] = vaddq_f64(a_re.val[1], b_im.val[1]); \
+ d_im.val[1] = vsubq_f64(a_im.val[1], b_re.val[1]); \
+ d_re.val[2] = vaddq_f64(a_re.val[2], b_im.val[2]); \
+ d_im.val[2] = vsubq_f64(a_im.val[2], b_re.val[2]); \
+ d_re.val[3] = vaddq_f64(a_re.val[3], b_im.val[3]); \
+ d_im.val[3] = vsubq_f64(a_im.val[3], b_re.val[3]);
+
+#define FWD_BOTJ(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJ(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJ(a_re, a_im, a_re, a_im, t_re, t_im);
+
+#define FWD_BOTJx4(a_re, a_im, b_re, b_im, t_re, t_im) \
+ FPC_SUBJx4(b_re, b_im, a_re, a_im, t_re, t_im); \
+ FPC_ADDJx4(a_re, a_im, a_re, a_im, t_re, t_im);
+
+//============== Inverse FFT
+/*
+ * FPC_J
+ * a * conj(b)
+ * Original (without swap):
+ * d_re = b_im * a_im + a_re * b_re;
+ * d_im = b_re * a_im - a_re * b_im;
+ */
+#define FPC_MUL_BOTJ_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 0); \
+ vfmla_lane(d_re, d_re, a_im, b_re_im, 1); \
+ vfmul_lane(d_im, a_im, b_re_im, 0); \
+ vfmls_lane(d_im, d_im, a_re, b_re_im, 1);
+
+#define FPC_MUL_BOTJ_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_im.val[0], a_im.val[0], b_re_im, 0); \
+ vfmls_lane(d_im.val[0], d_im.val[0], a_re.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_im.val[1], a_im.val[1], b_re_im, 0); \
+ vfmls_lane(d_im.val[1], d_im.val[1], a_re.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_im.val[2], a_im.val[2], b_re_im, 0); \
+ vfmls_lane(d_im.val[2], d_im.val[2], a_re.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 1); \
+ vfmul_lane(d_im.val[3], a_im.val[3], b_re_im, 0); \
+ vfmls_lane(d_im.val[3], d_im.val[3], a_re.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJ(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, b_im, a_im); \
+ vfmla(d_re, d_re, a_re, b_re); \
+ vfmul(d_im, b_re, a_im); \
+ vfmls(d_im, d_im, a_re, b_im);
+
+#define INV_TOPJ(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, a_re, a_im, b_re, b_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJ(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJ_LANEx4(b_re, b_im, t_re, t_im, zeta);
+
+/*
+ * FPC_Jm
+ * a * -conj(b)
+ * d_re = a_re * b_im - a_im * b_re;
+ * d_im = a_im * b_im + a_re * b_re;
+ */
+#define FPC_MUL_BOTJm_LANE(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re, a_re, b_re_im, 1); \
+ vfmls_lane(d_re, d_re, a_im, b_re_im, 0); \
+ vfmul_lane(d_im, a_re, b_re_im, 0); \
+ vfmla_lane(d_im, d_im, a_im, b_re_im, 1);
+
+#define FPC_MUL_BOTJm_LANEx4(d_re, d_im, a_re, a_im, b_re_im) \
+ vfmul_lane(d_re.val[0], a_re.val[0], b_re_im, 1); \
+ vfmls_lane(d_re.val[0], d_re.val[0], a_im.val[0], b_re_im, 0); \
+ vfmul_lane(d_im.val[0], a_re.val[0], b_re_im, 0); \
+ vfmla_lane(d_im.val[0], d_im.val[0], a_im.val[0], b_re_im, 1); \
+ vfmul_lane(d_re.val[1], a_re.val[1], b_re_im, 1); \
+ vfmls_lane(d_re.val[1], d_re.val[1], a_im.val[1], b_re_im, 0); \
+ vfmul_lane(d_im.val[1], a_re.val[1], b_re_im, 0); \
+ vfmla_lane(d_im.val[1], d_im.val[1], a_im.val[1], b_re_im, 1); \
+ vfmul_lane(d_re.val[2], a_re.val[2], b_re_im, 1); \
+ vfmls_lane(d_re.val[2], d_re.val[2], a_im.val[2], b_re_im, 0); \
+ vfmul_lane(d_im.val[2], a_re.val[2], b_re_im, 0); \
+ vfmla_lane(d_im.val[2], d_im.val[2], a_im.val[2], b_re_im, 1); \
+ vfmul_lane(d_re.val[3], a_re.val[3], b_re_im, 1); \
+ vfmls_lane(d_re.val[3], d_re.val[3], a_im.val[3], b_re_im, 0); \
+ vfmul_lane(d_im.val[3], a_re.val[3], b_re_im, 0); \
+ vfmla_lane(d_im.val[3], d_im.val[3], a_im.val[3], b_re_im, 1);
+
+#define FPC_MUL_BOTJm(d_re, d_im, a_re, a_im, b_re, b_im) \
+ vfmul(d_re, a_re, b_im); \
+ vfmls(d_re, d_re, a_im, b_re); \
+ vfmul(d_im, a_im, b_im); \
+ vfmla(d_im, d_im, a_re, b_re);
+
+#define INV_TOPJm(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUB(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADD(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_TOPJmx4(t_re, t_im, a_re, a_im, b_re, b_im) \
+ FPC_SUBx4(t_re, t_im, b_re, b_im, a_re, a_im); \
+ FPC_ADDx4(a_re, a_im, a_re, a_im, b_re, b_im);
+
+#define INV_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im) \
+ FPC_MUL_BOTJm(b_re, b_im, t_re, t_im, zeta_re, zeta_im);
+
+#define INV_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANE(b_re, b_im, t_re, t_im, zeta);
+
+#define INV_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta) \
+ FPC_MUL_BOTJm_LANEx4(b_re, b_im, t_re, t_im, zeta);
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrous.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrous.h
new file mode 100644
index 000000000..dfee8bc12
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/macrous.h
@@ -0,0 +1,469 @@
+/*
+ * Macros for signed/unsigned integer operations
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include <arm_neon.h>
+
+#define vmull_lo(c, a, b) c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
+
+#define vmull_hi(c, a, b) c = vmull_high_s16(a, b);
+
+#define vmulla_lo(d, c, a, b) d = vmlal_s16(c, vget_low_s16(a), vget_low_s16(b));
+
+#define vmulla_hi(d, c, a, b) d = vmlal_high_s16(c, a, b);
+
+#define vadd(c, a, b) c = vaddq_u32(a, b);
+
+#define vaddv(c, a) c = vaddvq_u32(a);
+
+#define vor(c, a, b) c = vorrq_u32(a, b);
+
+// Macros for NTT operations, using signed 16-bit arithmetic.
+#define vload_s16_4(c, addr) c = vld4q_s16(addr);
+#define vload_s16_x2(c, addr) c = vld1q_s16_x2(addr);
+#define vload_s16_x4(c, addr) c = vld1q_s16_x4(addr);
+
+#define vstore_s16_x4(addr, c) vst1q_s16_x4(addr, c);
+#define vstore_s16_x2(addr, c) vst1q_s16_x2(addr, c);
+#define vstore_s16_4(add, c) vst4q_s16(add, c);
+
+/*
+ * Strategy for the NTT:
+ * - Forward and inverse NTT multiplications by constants use either Barrett or Montgomery *Rounding* arithmetic
+ * - Pointwise multiplication must use Montgomery *Doubling* arithmetic
+ *
+ * Rounding, because:
+ *
+ * - Montgomery rounding needs one operand to be *odd*, so it only works with a precomputed coefficient.
+ * => This approach was tried; it is very strict on the input range of the coefficients.
+ * => E.g. a*b: a in [-R/2, R/2], b in [-Q/2, Q/2], then c in [-2Q, 2Q]
+ *
+ * - Barrett multiplication works with no such restriction.
+ * => Proved to be good. E.g. c = a*b, a in [-R, R], b in [-Q/2, Q/2], then c in [-3Q/2, 3Q/2]
+ * However, the output bound varies with the input bound. Using this knowledge, we can further
+ * optimize the Barrett points by carefully checking the output bound implied by each input bound.
+ *
+ * - Barrett reduction computes c = a % Q: a in [-R, R], then c in [-Q/2, Q/2]
+ *
+ * Doubling, because:
+ * - Montgomery doubling works with two unknown coefficients, with no constraint at all:
+ * => c = a*b, a, b in [-R, R], then c in [-Q, Q]
+ */
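+
+/*
+ * Scalar sketch of the two reductions (illustration only; the vector macros
+ * below are the implementation, and the constant-vector lane layout is
+ * assumed to be Q in lane 0 and q^-1 mod 2^16 in lane 1):
+ *
+ * Barrett rounding multiply by a fixed constant w (zl = w low 16 bits,
+ * zh = precomputed companion constant so that hi ~ round(t*w/Q)):
+ * hi = (2*t*zh + 2^15) >> 16; // vqrdmulhq
+ * c = (int16_t)(t*zl) - hi*Q; // congruent to t*w mod Q, in [-3Q/2, 3Q/2]
+ *
+ * Montgomery doubling multiply of two variable inputs (R = 2^16):
+ * hi = (2*a*b) >> 16; // vqdmulhq
+ * m = (int16_t)(a * (int16_t)(b * q^-1)); // low halves, vmulq
+ * c = (hi - ((2*m*Q) >> 16)) >> 1; // vhsubq; ~ (a*b - m*Q)/R, in [-Q, Q]
+ */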
+
+// ------------ Forward NTT and Inverse NTT ------------
+/*
+ * GS Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define gsbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b); \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]); \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top_x4(a, b, t) \
+ t.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ t.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ t.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ t.val[3] = vsubq_s16(a.val[3], b.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
+#define gsbf_bri_bot_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ b.val[0] = vqrdmulhq_laneq_s16(t.val[0], zh, i0); \
+ b.val[1] = vqrdmulhq_laneq_s16(t.val[1], zh, i1); \
+ b.val[2] = vqrdmulhq_laneq_s16(t.val[2], zh, i2); \
+ b.val[3] = vqrdmulhq_laneq_s16(t.val[3], zh, i3); \
+ t.val[0] = vmulq_laneq_s16(t.val[0], zl, i0); \
+ b.val[0] = vmlsq_laneq_s16(t.val[0], b.val[0], QMVQ, 0); \
+ t.val[1] = vmulq_laneq_s16(t.val[1], zl, i1); \
+ b.val[1] = vmlsq_laneq_s16(t.val[1], b.val[1], QMVQ, 0); \
+ t.val[2] = vmulq_laneq_s16(t.val[2], zl, i2); \
+ b.val[2] = vmlsq_laneq_s16(t.val[2], b.val[2], QMVQ, 0); \
+ t.val[3] = vmulq_laneq_s16(t.val[3], zl, i3); \
+ b.val[3] = vmlsq_laneq_s16(t.val[3], b.val[3], QMVQ, 0);
+
+#define gsbf_top(a, b, t) \
+ t = vsubq_s16(a, b); \
+ a = vaddq_s16(a, b);
+
+#define gsbf_bri_bot(b, zl, zh, i, QMVQ, t) \
+ b = vqrdmulhq_laneq_s16(t, zh, i); \
+ t = vmulq_laneq_s16(t, zl, i); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+
+#define gsbf_br_bot(b, zl, zh, QMVQ, t) \
+ b = vqrdmulhq_s16(t, zh); \
+ t = vmulq_s16(t, zl); \
+ b = vmlsq_laneq_s16(t, b, QMVQ, 0);
+/*
+ * Barrett multiplication via *Rounding*, used for the inverse NTT
+ * Input: a, b, zl, zh, Q. a in [-R, R]
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define barmul_invntt(a, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(a, zh, i); \
+ a = vmulq_laneq_s16(a, zl, i); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
+
+#define barmul_invntt_x2(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0);
+
+#define barmul_invntt_x4(a, zl, zh, i, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], zh, i); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], zh, i); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], zh, i); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], zh, i); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], zl, i); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], zl, i); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], zl, i); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], zl, i); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+/*
+ * Convert coefficients to Montgomery domain
+ */
+#define barmuli_mont(a, QMVM, t) \
+ t = vqrdmulhq_laneq_s16(a, QMVM, 6); \
+ a = vmulq_laneq_s16(a, QMVM, 2); \
+ a = vmlsq_laneq_s16(a, t, QMVM, 0);
+
+#define barmuli_mont_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 6); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 6); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 6); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 6); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 6); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 6); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 6); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 6); \
+ a.val[0] = vmulq_laneq_s16(a.val[0], QMVM, 2); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vmulq_laneq_s16(a.val[1], QMVM, 2); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vmulq_laneq_s16(a.val[2], QMVM, 2); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vmulq_laneq_s16(a.val[3], QMVM, 2); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 2); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 2); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 2); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 2); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * Convert coefficients to Montgomery domain and embed n^-1
+ */
+
+#define barmuli_mont_ninv_x8(a, b, QMVM, t, t2) \
+ t.val[0] = vqrdmulhq_laneq_s16(a.val[0], QMVM, 7); \
+ t.val[1] = vqrdmulhq_laneq_s16(a.val[1], QMVM, 7); \
+ t.val[2] = vqrdmulhq_laneq_s16(a.val[2], QMVM, 7); \
+ t.val[3] = vqrdmulhq_laneq_s16(a.val[3], QMVM, 7); \
+ t2.val[0] = vqrdmulhq_laneq_s16(b.val[0], QMVM, 7); \
+ t2.val[1] = vqrdmulhq_laneq_s16(b.val[1], QMVM, 7); \
+ t2.val[2] = vqrdmulhq_laneq_s16(b.val[2], QMVM, 7); \
+ t2.val[3] = vqrdmulhq_laneq_s16(b.val[3], QMVM, 7); \
+ a.val[0] = vshlq_n_s16(a.val[0], FALCON_LOG2_NINV_MONT); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVM, 0); \
+ a.val[1] = vshlq_n_s16(a.val[1], FALCON_LOG2_NINV_MONT); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVM, 0); \
+ a.val[2] = vshlq_n_s16(a.val[2], FALCON_LOG2_NINV_MONT); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVM, 0); \
+ a.val[3] = vshlq_n_s16(a.val[3], FALCON_LOG2_NINV_MONT); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVM, 0); \
+ b.val[0] = vshlq_n_s16(b.val[0], FALCON_LOG2_NINV_MONT); \
+ b.val[0] = vmlsq_laneq_s16(b.val[0], t2.val[0], QMVM, 0); \
+ b.val[1] = vshlq_n_s16(b.val[1], FALCON_LOG2_NINV_MONT); \
+ b.val[1] = vmlsq_laneq_s16(b.val[1], t2.val[1], QMVM, 0); \
+ b.val[2] = vshlq_n_s16(b.val[2], FALCON_LOG2_NINV_MONT); \
+ b.val[2] = vmlsq_laneq_s16(b.val[2], t2.val[2], QMVM, 0); \
+ b.val[3] = vshlq_n_s16(b.val[3], FALCON_LOG2_NINV_MONT); \
+ b.val[3] = vmlsq_laneq_s16(b.val[3], t2.val[3], QMVM, 0);
+
+/*
+ * CT Butterfly with Barrett *Rounding* reduction
+ * Input: a in [-R, R], zl = w, zh = precomp_w, N, t
+ * Output: c = a * b % Q. c in [-3Q/2, 3Q/2]
+ */
+#define ctbf_br(a, b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_bri(a, b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0); \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_br_top(b, zl, zh, QMVQ, t) \
+ t = vqrdmulhq_s16(b, zh); \
+ b = vmulq_s16(b, zl); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bri_top(b, zl, zh, i, QMVQ, t) \
+ t = vqrdmulhq_laneq_s16(b, zh, i); \
+ b = vmulq_laneq_s16(b, zl, i); \
+ t = vmlsq_laneq_s16(b, t, QMVQ, 0);
+
+#define ctbf_bot(a, b, t) \
+ b = vsubq_s16(a, t); \
+ a = vaddq_s16(a, t);
+
+#define ctbf_bri_top_x4(b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0);
+
+#define ctbf_bot_x4(a, b, t) \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+#define ctbf_bri_x4(a, b, zl, zh, i0, i1, i2, i3, QMVQ, t) \
+ t.val[0] = vqrdmulhq_laneq_s16(b.val[0], zh, i0); \
+ t.val[1] = vqrdmulhq_laneq_s16(b.val[1], zh, i1); \
+ t.val[2] = vqrdmulhq_laneq_s16(b.val[2], zh, i2); \
+ t.val[3] = vqrdmulhq_laneq_s16(b.val[3], zh, i3); \
+ b.val[0] = vmulq_laneq_s16(b.val[0], zl, i0); \
+ t.val[0] = vmlsq_laneq_s16(b.val[0], t.val[0], QMVQ, 0); \
+ b.val[1] = vmulq_laneq_s16(b.val[1], zl, i1); \
+ t.val[1] = vmlsq_laneq_s16(b.val[1], t.val[1], QMVQ, 0); \
+ b.val[2] = vmulq_laneq_s16(b.val[2], zl, i2); \
+ t.val[2] = vmlsq_laneq_s16(b.val[2], t.val[2], QMVQ, 0); \
+ b.val[3] = vmulq_laneq_s16(b.val[3], zl, i3); \
+ t.val[3] = vmlsq_laneq_s16(b.val[3], t.val[3], QMVQ, 0); \
+ b.val[0] = vsubq_s16(a.val[0], t.val[0]); \
+ b.val[1] = vsubq_s16(a.val[1], t.val[1]); \
+ b.val[2] = vsubq_s16(a.val[2], t.val[2]); \
+ b.val[3] = vsubq_s16(a.val[3], t.val[3]); \
+ a.val[0] = vaddq_s16(a.val[0], t.val[0]); \
+ a.val[1] = vaddq_s16(a.val[1], t.val[1]); \
+ a.val[2] = vaddq_s16(a.val[2], t.val[2]); \
+ a.val[3] = vaddq_s16(a.val[3], t.val[3]);
+
+// ------------ Pointwise Multiplication ------------
+/*
+ * Montgomery multiplication via *Doubling*
+ * Input: a, b, bNinv, Q
+ * Output: c = ab * R^-1
+ */
+#define montmul(c, a, b, QMVM, t) \
+ c = vqdmulhq_s16(a, b); \
+ t = vmulq_laneq_s16(b, QMVM, 1); \
+ t = vmulq_s16(a, t); \
+ t = vqdmulhq_laneq_s16(t, QMVM, 0); \
+ c = vhsubq_s16(c, t);
+
+#define montmul_x4(z, a, b, QMVM, t) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]);
+
+#define montmul_x8(z, w, a, b, e, f, QMVM, t, k) \
+ z.val[0] = vqdmulhq_s16(a.val[0], b.val[0]); \
+ z.val[1] = vqdmulhq_s16(a.val[1], b.val[1]); \
+ z.val[2] = vqdmulhq_s16(a.val[2], b.val[2]); \
+ z.val[3] = vqdmulhq_s16(a.val[3], b.val[3]); \
+ w.val[0] = vqdmulhq_s16(e.val[0], f.val[0]); \
+ w.val[1] = vqdmulhq_s16(e.val[1], f.val[1]); \
+ w.val[2] = vqdmulhq_s16(e.val[2], f.val[2]); \
+ w.val[3] = vqdmulhq_s16(e.val[3], f.val[3]); \
+ t.val[0] = vmulq_laneq_s16(b.val[0], QMVM, 1); \
+ t.val[1] = vmulq_laneq_s16(b.val[1], QMVM, 1); \
+ t.val[2] = vmulq_laneq_s16(b.val[2], QMVM, 1); \
+ t.val[3] = vmulq_laneq_s16(b.val[3], QMVM, 1); \
+ k.val[0] = vmulq_laneq_s16(f.val[0], QMVM, 1); \
+ k.val[1] = vmulq_laneq_s16(f.val[1], QMVM, 1); \
+ k.val[2] = vmulq_laneq_s16(f.val[2], QMVM, 1); \
+ k.val[3] = vmulq_laneq_s16(f.val[3], QMVM, 1); \
+ t.val[0] = vmulq_s16(a.val[0], t.val[0]); \
+ t.val[1] = vmulq_s16(a.val[1], t.val[1]); \
+ t.val[2] = vmulq_s16(a.val[2], t.val[2]); \
+ t.val[3] = vmulq_s16(a.val[3], t.val[3]); \
+ k.val[0] = vmulq_s16(e.val[0], k.val[0]); \
+ k.val[1] = vmulq_s16(e.val[1], k.val[1]); \
+ k.val[2] = vmulq_s16(e.val[2], k.val[2]); \
+ k.val[3] = vmulq_s16(e.val[3], k.val[3]); \
+ t.val[0] = vqdmulhq_laneq_s16(t.val[0], QMVM, 0); \
+ z.val[0] = vhsubq_s16(z.val[0], t.val[0]); \
+ t.val[1] = vqdmulhq_laneq_s16(t.val[1], QMVM, 0); \
+ z.val[1] = vhsubq_s16(z.val[1], t.val[1]); \
+ t.val[2] = vqdmulhq_laneq_s16(t.val[2], QMVM, 0); \
+ z.val[2] = vhsubq_s16(z.val[2], t.val[2]); \
+ t.val[3] = vqdmulhq_laneq_s16(t.val[3], QMVM, 0); \
+ z.val[3] = vhsubq_s16(z.val[3], t.val[3]); \
+ k.val[0] = vqdmulhq_laneq_s16(k.val[0], QMVM, 0); \
+ w.val[0] = vhsubq_s16(w.val[0], k.val[0]); \
+ k.val[1] = vqdmulhq_laneq_s16(k.val[1], QMVM, 0); \
+ w.val[1] = vhsubq_s16(w.val[1], k.val[1]); \
+ k.val[2] = vqdmulhq_laneq_s16(k.val[2], QMVM, 0); \
+ w.val[2] = vhsubq_s16(w.val[2], k.val[2]); \
+ k.val[3] = vqdmulhq_laneq_s16(k.val[3], QMVM, 0); \
+ w.val[3] = vhsubq_s16(w.val[3], k.val[3]);
+
+// ------------ Barrett Reduction ------------
+/*
+ * Barrett reduction, return [-Q/2, Q/2]
+ * `v` = 5461, `n` = 11
+ */
+#define barrett(a, QMVQ, t) \
+ t = vqdmulhq_laneq_s16(a, QMVQ, 4); \
+ t = vrshrq_n_s16(t, 11); \
+ a = vmlsq_laneq_s16(a, t, QMVQ, 0);
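+
+/*
+ * Scalar reference for the Barrett reduction above (illustrative only):
+ * v = FALCON_V = 5461 is about 2^26 / q, so t approximates round(a / q) and
+ * a - t*q lands in [-Q/2, Q/2].
+ *
+ *   static int16_t barrett_ref(int16_t a) {
+ *       int16_t t = (int16_t)((2 * (int32_t)a * FALCON_V) >> 16); // vqdmulhq, lane 4 = v
+ *       t = (int16_t)((t + (1 << 10)) >> 11);                     // vrshrq_n, rounding shift
+ *       return (int16_t)(a - t * FALCON_Q);                       // vmlsq, lane 0 = q
+ *   }
+ */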
+
+#define barrett_x2(a, i, j, m, n, QMVQ, t) \
+ t.val[m] = vqdmulhq_laneq_s16(a.val[i], QMVQ, 4); \
+ t.val[m] = vrshrq_n_s16(t.val[m], 11); \
+ t.val[n] = vqdmulhq_laneq_s16(a.val[j], QMVQ, 4); \
+ t.val[n] = vrshrq_n_s16(t.val[n], 11); \
+ a.val[i] = vmlsq_laneq_s16(a.val[i], t.val[m], QMVQ, 0); \
+ a.val[j] = vmlsq_laneq_s16(a.val[j], t.val[n], QMVQ, 0);
+
+#define barrett_x4(a, QMVQ, t) \
+ t.val[0] = vqdmulhq_laneq_s16(a.val[0], QMVQ, 4); \
+ t.val[0] = vrshrq_n_s16(t.val[0], 11); \
+ t.val[1] = vqdmulhq_laneq_s16(a.val[1], QMVQ, 4); \
+ t.val[1] = vrshrq_n_s16(t.val[1], 11); \
+ t.val[2] = vqdmulhq_laneq_s16(a.val[2], QMVQ, 4); \
+ t.val[2] = vrshrq_n_s16(t.val[2], 11); \
+ t.val[3] = vqdmulhq_laneq_s16(a.val[3], QMVQ, 4); \
+ t.val[3] = vrshrq_n_s16(t.val[3], 11); \
+ a.val[0] = vmlsq_laneq_s16(a.val[0], t.val[0], QMVQ, 0); \
+ a.val[1] = vmlsq_laneq_s16(a.val[1], t.val[1], QMVQ, 0); \
+ a.val[2] = vmlsq_laneq_s16(a.val[2], t.val[2], QMVQ, 0); \
+ a.val[3] = vmlsq_laneq_s16(a.val[3], t.val[3], QMVQ, 0);
+
+// ------------ Matrix Transpose ------------
+/*
+ * Matrix 4x4 transpose: v
+ * Input: int16x8x4_t v, tmp
+ * Output: int16x8x4_t v
+ */
+#define transpose(v, tmp) \
+ tmp.val[0] = vtrn1q_s16(v.val[0], v.val[1]); \
+ tmp.val[1] = vtrn2q_s16(v.val[0], v.val[1]); \
+ tmp.val[2] = vtrn1q_s16(v.val[2], v.val[3]); \
+ tmp.val[3] = vtrn2q_s16(v.val[2], v.val[3]); \
+ v.val[0] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[2] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[0], (int32x4_t)tmp.val[2]); \
+ v.val[1] = (int16x8_t)vtrn1q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]); \
+ v.val[3] = (int16x8_t)vtrn2q_s32((int32x4_t)tmp.val[1], (int32x4_t)tmp.val[3]);
+
+// ------------ Re-arrange vector ------------
+#define arrange(v_out, v_in, i, j, m, n, a, b, c, d) \
+ v_out.val[a] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[b] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[i], (int64x2_t)v_in.val[j]); \
+ v_out.val[c] = (int16x8_t)vtrn1q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]); \
+ v_out.val[d] = (int16x8_t)vtrn2q_s64((int64x2_t)v_in.val[m], (int64x2_t)v_in.val[n]);
+
+// ------------ Addition/Subtraction ------------
+#define vsub_x4(c, a, b) \
+ c.val[0] = vsubq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vsubq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vsubq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+#define vadd_x4(c, a, b) \
+ c.val[0] = vaddq_s16(a.val[0], b.val[0]); \
+ c.val[1] = vaddq_s16(a.val[1], b.val[1]); \
+ c.val[2] = vaddq_s16(a.val[2], b.val[2]); \
+ c.val[3] = vaddq_s16(a.val[3], b.val[3]);
+
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt.c
new file mode 100644
index 000000000..9b8c7e92f
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt.c
@@ -0,0 +1,822 @@
+/*
+ * High-speed vectorized NTT for N = 512, 1024
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrous.h"
+#include "ntt_consts.h"
+#include "poly.h"
+
+#include <arm_neon.h>
+
+/*
+ * Assume Input in the range [-Q/2, Q/2]
+ * Total Barrett point for N = 512, 1024: 2048, 4096
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont) {
+ // Total SIMD registers 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_ntt_br = PQCLEAN_FALCONPADDED512_AARCH64_ntt_br;
+ const int16_t *ptr_ntt_qinv_br = PQCLEAN_FALCONPADDED512_AARCH64_ntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+ zl.val[0] = vld1q_s16(ptr_ntt_br);
+ zh.val[0] = vld1q_s16(ptr_ntt_qinv_br);
+ ptr_ntt_br += 8;
+ ptr_ntt_qinv_br += 8;
+
+ // Layer 8, 7
+ for (unsigned j = 0; j < 128; j += 32) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 128]);
+ vload_s16_x4(v2, &a[j + 256]);
+ vload_s16_x4(v3, &a[j + 384]);
+
+ // v0: .5
+ // v1: .5
+ // v2: .5
+ // v3: .5
+
+ // Layer 8
+ // v0 - v2, v1 - v3
+ ctbf_bri_top_x4(v2, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v2, t);
+ ctbf_bot_x4(v1, v3, t2);
+
+ // v0: 1.2
+ // v1: 1.2
+ // v2: 1.2
+ // v3: 1.2
+
+ // Layer 7
+ // v0 - v1, v2 - v3
+ ctbf_bri_top_x4(v1, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zl.val[0], zh.val[0], 3, 3, 3, 3, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v1, t);
+ ctbf_bot_x4(v2, v3, t2);
+
+ // 2.14 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Store at 0.5Q
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 128], v1);
+ vstore_s16_x4(&a[j + 256], v2);
+ vstore_s16_x4(&a[j + 384], v3);
+ }
+
+ // Layer 6, 5, 4, 3, 2, 1, 0
+ for (unsigned j = 0; j < FALCON_N; j += 128) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 32]);
+ vload_s16_x4(v2, &a[j + 64]);
+ vload_s16_x4(v3, &a[j + 96]);
+
+ vload_s16_x2(zlh, ptr_ntt_br);
+ vload_s16_x2(zhh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 16;
+ ptr_ntt_qinv_br += 16;
+
+ // Layer 6
+ // v0 - v2, v1 - v3
+ ctbf_bri_top_x4(v2, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 0, 0, 0, 0, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v2, t);
+ ctbf_bot_x4(v1, v3, t2);
+
+ // 1.3
+
+ // Layer 5
+ // v0 - v1, v2 - v3
+ ctbf_bri_top_x4(v1, zlh.val[0], zhh.val[0], 1, 1, 1, 1, neon_qmvq, t);
+ ctbf_bri_top_x4(v3, zlh.val[0], zhh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+
+ ctbf_bot_x4(v0, v1, t);
+ ctbf_bot_x4(v2, v3, t2);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 4
+ // v0(0, 1 - 2, 3)
+ // v1(0, 1 - 2, 3)
+ // v2(0, 1 - 2, 3)
+ // v3(0, 1 - 2, 3)
+ ctbf_bri_top(v0.val[2], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[2], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[0], zhh.val[0], 4, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[2], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[2], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t2.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t2.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t2.val[3]);
+
+ // 1.3
+
+ // Layer 3
+ // v0(0, 2 - 1, 3)
+ // v1(0, 2 - 1, 3)
+ // v2(0, 2 - 1, 3)
+ // v3(0, 2 - 1, 3)
+ ctbf_bri_top(v0.val[1], zlh.val[0], zhh.val[0], 7, neon_qmvq, t.val[0]);
+ ctbf_bri_top(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ ctbf_bri_top(v1.val[1], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ ctbf_bri_top(v1.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t.val[3]);
+
+ ctbf_bri_top(v2.val[1], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[0]);
+ ctbf_bri_top(v2.val[3], zlh.val[1], zhh.val[1], 4, neon_qmvq, t2.val[1]);
+ ctbf_bri_top(v3.val[1], zlh.val[1], zhh.val[1], 5, neon_qmvq, t2.val[2]);
+ ctbf_bri_top(v3.val[3], zlh.val[1], zhh.val[1], 6, neon_qmvq, t2.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v0.val[2], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[2]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[1], t2.val[0]);
+ ctbf_bot(v2.val[2], v2.val[3], t2.val[1]);
+ ctbf_bot(v3.val[0], v3.val[1], t2.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t2.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 2
+ // Input:
+ // 0, 1, 2, 3 | 4, 5, 6, 7
+ // 8, 9, 10, 11 | 12, 13, 14, 15
+ // 16, 17, 18, 19 | 20, 21, 22, 23
+ // 24, 25, 26, 27 | 28, 29, 30, 31
+ arrange(t, v0, 0, 2, 1, 3, 0, 1, 2, 3);
+ v0 = t;
+ arrange(t, v1, 0, 2, 1, 3, 0, 1, 2, 3);
+ v1 = t;
+ arrange(t2, v2, 0, 2, 1, 3, 0, 1, 2, 3);
+ v2 = t2;
+ arrange(t2, v3, 0, 2, 1, 3, 0, 1, 2, 3);
+ v3 = t2;
+ // Output:
+ // 0, 1, 2, 3 | 16, 17, 18, 19
+ // 4, 5, 6, 7 | 20, 21, 22, 23
+ // 8, 9, 10, 11 | 24, 25, 26, 27
+ // 12, 13, 14, 15 | 28, 29, 30, 31
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 1.3
+
+ // Layer 1: v0.val[0] x v0.val[2] | v0.val[1] x v0.val[3]
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // transpose 4x4
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[2], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[1]);
+ ctbf_br_top(v1.val[2], zl.val[1], zh.val[1], neon_qmvq, t.val[2]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[2], t.val[0]);
+ ctbf_bot(v0.val[1], v0.val[3], t.val[1]);
+ ctbf_bot(v1.val[0], v1.val[2], t.val[2]);
+ ctbf_bot(v1.val[1], v1.val[3], t.val[3]);
+
+ ctbf_br_top(v2.val[2], zl.val[2], zh.val[2], neon_qmvq, t.val[0]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[1]);
+ ctbf_br_top(v3.val[2], zl.val[3], zh.val[3], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v2.val[0], v2.val[2], t.val[0]);
+ ctbf_bot(v2.val[1], v2.val[3], t.val[1]);
+ ctbf_bot(v3.val[0], v3.val[2], t.val[2]);
+ ctbf_bot(v3.val[1], v3.val[3], t.val[3]);
+
+ // 2.3 -> 0.5
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // Layer 0
+ // v(0, 2 - 1, 3)
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[1], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[1], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[1], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[1], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[0], v0.val[1], t.val[0]);
+ ctbf_bot(v1.val[0], v1.val[1], t.val[1]);
+ ctbf_bot(v2.val[0], v2.val[1], t.val[2]);
+ ctbf_bot(v3.val[0], v3.val[1], t.val[3]);
+
+ vload_s16_x4(zl, ptr_ntt_br);
+ vload_s16_x4(zh, ptr_ntt_qinv_br);
+ ptr_ntt_br += 32;
+ ptr_ntt_qinv_br += 32;
+
+ ctbf_br_top(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t.val[0]);
+ ctbf_br_top(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t.val[1]);
+ ctbf_br_top(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t.val[2]);
+ ctbf_br_top(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t.val[3]);
+
+ ctbf_bot(v0.val[2], v0.val[3], t.val[0]);
+ ctbf_bot(v1.val[2], v1.val[3], t.val[1]);
+ ctbf_bot(v2.val[2], v2.val[3], t.val[2]);
+ ctbf_bot(v3.val[2], v3.val[3], t.val[3]);
+
+ // 1.3
+ if (mont == NTT_MONT) {
+ // Convert to Montgomery domain by multiply with FALCON_MONT
+ barmuli_mont_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_x8(v2, v3, neon_qmvq, t, t2);
+ } else if (mont == NTT_MONT_INV) {
+ barmuli_mont_ninv_x8(v0, v1, neon_qmvq, t, t2);
+ barmuli_mont_ninv_x8(v2, v3, neon_qmvq, t, t2);
+ }
+
+        vstore_s16_x4(&a[j], v0);
+        vstore_s16_x4(&a[j + 32], v1);
+        vstore_s16_x4(&a[j + 64], v2);
+        vstore_s16_x4(&a[j + 96], v3);
+ }
+}
+
+/*
+ * Assume input in range [-Q, Q]
+ * Total Barrett point N = 512, 1024: 1792, 3840
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv) {
+ // Total SIMD registers: 29 = 16 + 12 + 1
+ int16x8x4_t v0, v1, v2, v3; // 16
+ int16x8x4_t zl, zh, t, t2; // 12
+ int16x8x2_t zlh, zhh; // 4
+ int16x8_t neon_qmvq; // 1
+ const int16_t *ptr_invntt_br = PQCLEAN_FALCONPADDED512_AARCH64_invntt_br;
+ const int16_t *ptr_invntt_qinv_br = PQCLEAN_FALCONPADDED512_AARCH64_invntt_qinv_br;
+
+ neon_qmvq = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+ unsigned j;
+
+ // Layer 0, 1, 2, 3, 4, 5, 6
+ for (j = 0; j < FALCON_N; j += 128) {
+        vload_s16_x4(v0, &a[j]);
+        vload_s16_x4(v1, &a[j + 32]);
+        vload_s16_x4(v2, &a[j + 64]);
+        vload_s16_x4(v3, &a[j + 96]);
+
+ // Layer 0
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ // 0 - 1*, 2 - 3*
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 1
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ // 0 - 2*, 1 - 3*
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v0.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t.val[1]);
+ gsbf_br_bot(v1.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v1.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[2], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v2.val[3], zlh.val[0], zhh.val[0], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v3.val[2], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zlh.val[1], zhh.val[1], neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.6
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 2
+ // Before Transpose
+ // v0.val[0]: 0, 4, 8, 12 | 16, 20, 24, 28
+ // v0.val[1]: 1, 5, 9, 13 | 17, 21, 25, 29
+ // v0.val[2]: 2, 6, 10, 14 | 18, 22, 26, 30
+ // v0.val[3]: 3, 7, 11, 15 | 19, 23, 27, 31
+ transpose(v0, t);
+ transpose(v1, t);
+ transpose(v2, t2);
+ transpose(v3, t2);
+
+ // After Transpose
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // 0 - 1*, 2 - 3*
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[1]);
+ gsbf_top(v2.val[0], v2.val[1], t.val[2]);
+ gsbf_top(v3.val[0], v3.val[1], t.val[3]);
+
+ gsbf_top(v0.val[2], v0.val[3], t2.val[0]);
+ gsbf_top(v1.val[2], v1.val[3], t2.val[1]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_br_bot(v0.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[0]);
+ gsbf_br_bot(v1.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[1]);
+
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_br_bot(v2.val[1], zlh.val[0], zhh.val[0], neon_qmvq, t.val[2]);
+ gsbf_br_bot(v3.val[1], zlh.val[1], zhh.val[1], neon_qmvq, t.val[3]);
+
+ vload_s16_x4(zl, ptr_invntt_br);
+ vload_s16_x4(zh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 32;
+ ptr_invntt_qinv_br += 32;
+
+ gsbf_br_bot(v0.val[3], zl.val[0], zh.val[0], neon_qmvq, t2.val[0]);
+ gsbf_br_bot(v1.val[3], zl.val[1], zh.val[1], neon_qmvq, t2.val[1]);
+ gsbf_br_bot(v2.val[3], zl.val[2], zh.val[2], neon_qmvq, t2.val[2]);
+ gsbf_br_bot(v3.val[3], zl.val[3], zh.val[3], neon_qmvq, t2.val[3]);
+
+ // 0: 1
+ // 1: 0.9
+ // 2: 1
+ // 3: 0.9
+
+ // Layer 3
+ // Re-arrange vector from
+ // v0.val[0]: 0, 1, 2, 3 | 16, 17, 18, 19
+ // v0.val[1]: 4, 5, 6, 7 | 20, 21, 22, 23
+ // v0.val[2]: 8, 9, 10, 11 | 24, 25, 26, 27
+ // v0.val[3]: 12, 13, 14, 15 | 28, 29, 30, 31
+ // Compiler will handle register re-naming
+ arrange(t, v0, 0, 1, 2, 3, 0, 2, 1, 3);
+ v0 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t, v1, 0, 1, 2, 3, 0, 2, 1, 3);
+ v1 = t;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v2, 0, 1, 2, 3, 0, 2, 1, 3);
+ v2 = t2;
+
+ // Compiler will handle register re-naming
+ arrange(t2, v3, 0, 1, 2, 3, 0, 2, 1, 3);
+ v3 = t2;
+ // To
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 1, 2 - 3
+ vload_s16_x2(zlh, ptr_invntt_br);
+ vload_s16_x2(zhh, ptr_invntt_qinv_br);
+ ptr_invntt_br += 16;
+ ptr_invntt_qinv_br += 16;
+
+ gsbf_top(v0.val[0], v0.val[1], t.val[0]);
+ gsbf_top(v0.val[2], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[1], t.val[2]);
+ gsbf_top(v1.val[2], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[1], t2.val[0]);
+ gsbf_top(v2.val[2], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[1], t2.val[2]);
+ gsbf_top(v3.val[2], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[1], zlh.val[0], zhh.val[0], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[0], zhh.val[0], 1, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[1], zlh.val[0], zhh.val[0], 2, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[0], zhh.val[0], 3, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[1], zlh.val[0], zhh.val[0], 4, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[0], zhh.val[0], 5, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[1], zlh.val[0], zhh.val[0], 6, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[0], zhh.val[0], 7, neon_qmvq, t2.val[3]);
+
+ // 0: 2
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ barrett(v0.val[0], neon_qmvq, t.val[0]);
+ barrett(v1.val[0], neon_qmvq, t.val[1]);
+ barrett(v2.val[0], neon_qmvq, t.val[2]);
+ barrett(v3.val[0], neon_qmvq, t.val[3]);
+
+ // 0: 0.5
+ // 1: 1.3
+ // 2: 2
+ // 3: 1.3
+
+ // Layer 4
+ // v0.val[0]: 0, 1, 2, 3 | 4, 5, 6, 7
+ // v0.val[1]: 8, 9, 10, 11 | 12, 13, 14, 15
+ // v0.val[2]: 16, 17, 18, 19 | 20, 21, 22, 23
+ // v0.val[3]: 24, 25, 26, 27 | 28, 29, 30, 31
+ // 0 - 2, 1 - 3
+
+ gsbf_top(v0.val[0], v0.val[2], t.val[0]);
+ gsbf_top(v0.val[1], v0.val[3], t.val[1]);
+ gsbf_top(v1.val[0], v1.val[2], t.val[2]);
+ gsbf_top(v1.val[1], v1.val[3], t.val[3]);
+
+ gsbf_top(v2.val[0], v2.val[2], t2.val[0]);
+ gsbf_top(v2.val[1], v2.val[3], t2.val[1]);
+ gsbf_top(v3.val[0], v3.val[2], t2.val[2]);
+ gsbf_top(v3.val[1], v3.val[3], t2.val[3]);
+
+ gsbf_bri_bot(v0.val[2], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[0]);
+ gsbf_bri_bot(v0.val[3], zlh.val[1], zhh.val[1], 0, neon_qmvq, t.val[1]);
+ gsbf_bri_bot(v1.val[2], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[2]);
+ gsbf_bri_bot(v1.val[3], zlh.val[1], zhh.val[1], 1, neon_qmvq, t.val[3]);
+
+ gsbf_bri_bot(v2.val[2], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[0]);
+ gsbf_bri_bot(v2.val[3], zlh.val[1], zhh.val[1], 2, neon_qmvq, t2.val[1]);
+ gsbf_bri_bot(v3.val[2], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[2]);
+ gsbf_bri_bot(v3.val[3], zlh.val[1], zhh.val[1], 3, neon_qmvq, t2.val[3]);
+
+ // 0: 2.5
+ // 1: 2.5
+ // 2: 1.5
+ // 3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // 0: 0.5
+ // 1: 0.5
+ // 2: 0.5
+ // 3: 0.5
+
+ // Layer 5
+ // Cross block
+ // v0.0->3 - v1.0->3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zlh.val[1], zhh.val[1], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 5, 5, 5, 5, neon_qmvq, t2);
+
+ // v0: 1
+ // v1: 0.9
+ // v2: 1
+ // v3: 0.9
+
+ // Layer 6
+ // Cross block
+ // v0.0->3 - v2.0->3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ gsbf_bri_bot_x4(v2, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zlh.val[1], zhh.val[1], 6, 6, 6, 6, neon_qmvq, t2);
+
+ // v0: 2
+ // v1: 1.8
+ // v2: 1.3
+ // v3: 1.2
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 32], v1);
+ vstore_s16_x4(&a[j + 64], v2);
+ vstore_s16_x4(&a[j + 96], v3);
+ }
+
+ zl.val[0] = vld1q_s16(ptr_invntt_br);
+ zh.val[0] = vld1q_s16(ptr_invntt_qinv_br);
+
+ // Layer 7, 8
+ for (j = 0; j < 64; j += 32) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 128]);
+ vload_s16_x4(v2, &a[j + 256]);
+ vload_s16_x4(v3, &a[j + 384]);
+
+ // 2
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // v0: .5
+ // v1: .5
+ // v2: .5
+ // v3: .5
+
+ // Layer 7
+ // v0 - v1, v2 - v3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zl.val[0], zh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t2);
+
+ // v0: 1
+ // v1: .87
+ // v2: 1
+ // v3: .87
+
+ // Layer 8
+ // v0 - v2, v1 - v3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ // v0: 2
+ // v1: 1.75
+ // v2: 1.25
+ // v3: 1.15
+ if (ninv == INVNTT_NINV) {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+ barmul_invntt_x4(v0, zl.val[0], zh.val[0], 3, neon_qmvq, t);
+ barmul_invntt_x4(v1, zl.val[0], zh.val[0], 3, neon_qmvq, t2);
+ } else {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t2);
+ }
+
+ // v0: 1.25
+ // v1: 1.15
+ // v2: 1.25
+ // v3: 1.15
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+
+ // v0: 0.5
+ // v1: 0.5
+ // v2: 0.97
+ // v3: 0.93
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 128], v1);
+ vstore_s16_x4(&a[j + 256], v2);
+ vstore_s16_x4(&a[j + 384], v3);
+ }
+ for (; j < 128; j += 32) {
+ vload_s16_x4(v0, &a[j]);
+ vload_s16_x4(v1, &a[j + 128]);
+ vload_s16_x4(v2, &a[j + 256]);
+ vload_s16_x4(v3, &a[j + 384]);
+
+ // v0: 1.3
+ // v1: 1.3
+ // v2: 1.3
+ // v3: 1.3
+
+ // Layer 7
+ // v0 - v1, v2 - v3
+ gsbf_top_x4(v0, v1, t);
+ gsbf_top_x4(v2, v3, t2);
+
+ gsbf_bri_bot_x4(v1, zl.val[0], zh.val[0], 0, 0, 0, 0, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 1, 1, 1, 1, neon_qmvq, t2);
+
+ // v0: 2.6
+ // v1: 1.5
+ // v2: 2.6
+ // v3: 1.5
+
+ barrett_x4(v0, neon_qmvq, t);
+ barrett_x4(v1, neon_qmvq, t);
+ barrett_x4(v2, neon_qmvq, t2);
+ barrett_x4(v3, neon_qmvq, t2);
+
+ // v0: 0.5
+ // v1: 0.5
+ // v2: 0.5
+ // v3: 0.5
+
+ // Layer 8
+ // v0 - v2, v1 - v3
+ gsbf_top_x4(v0, v2, t);
+ gsbf_top_x4(v1, v3, t2);
+
+ // v0: 1
+ // v1: 1
+ // v2: .87
+ // v3: .87
+ if (ninv == INVNTT_NINV) {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 2, 2, 2, 2, neon_qmvq, t2);
+ barmul_invntt_x4(v0, zl.val[0], zh.val[0], 3, neon_qmvq, t);
+ barmul_invntt_x4(v1, zl.val[0], zh.val[0], 3, neon_qmvq, t2);
+ } else {
+ gsbf_bri_bot_x4(v2, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t);
+ gsbf_bri_bot_x4(v3, zl.val[0], zh.val[0], 4, 4, 4, 4, neon_qmvq, t2);
+ }
+
+ // v0: .87
+ // v1: .87
+ // v2: .83
+ // v3: .83
+
+ vstore_s16_x4(&a[j], v0);
+ vstore_s16_x4(&a[j + 128], v1);
+ vstore_s16_x4(&a[j + 256], v2);
+ vstore_s16_x4(&a[j + 384], v3);
+ }
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e1, e2, t, k; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+ vload_s16_x4(b, &g[i]);
+ vload_s16_x4(c, &f[i + 32]);
+ vload_s16_x4(d, &g[i + 32]);
+
+ montmul_x8(e1, e2, a, b, c, d, neon_qmvm, t, k);
+
+ vstore_s16_x4(&f[i], e1);
+ vstore_s16_x4(&f[i + 32], e2);
+ }
+}
+
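+/*
+ * A plausible call sequence for a full negacyclic product h = f*g over
+ * Z_q[x]/(x^N + 1) using the routines above -- a hedged sketch only; which
+ * operand is lifted to the Montgomery domain is dictated by the callers of
+ * these functions and may differ from this outline:
+ *
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(f, NTT_NONE);
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(g, NTT_MONT);        // scale by R = 2^16
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(f, g);       // f <- f*g*R^-1 pointwise
+ *   PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(f, INVNTT_NINV);  // undo NTT, multiply by 1/N
+ */
+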
+/* ===================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.c
new file mode 100644
index 000000000..1f0076ebd
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.c
@@ -0,0 +1,377 @@
+#include "ntt_consts.h"
+#include "params.h"
+
+#define PADDING 0
+
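+/*
+ * Lane layout consumed by the macros in macrous.h: lane 0 = q, lane 1 =
+ * q^-1 mod 2^16, lane 2 = R = 2^16 mod q, lane 3 = N^-1 * R mod q, lane 4 =
+ * the Barrett constant v, lane 5 unused, lanes 6-7 = the precomputed
+ * quotients FALCON_MONT_BR and FALCON_NINV_MONT_BR (see params.h).
+ */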
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_qmvq[8] = {FALCON_Q, FALCON_QINV,
+ FALCON_MONT, FALCON_NINV_MONT,
+ FALCON_V, 0,
+ FALCON_MONT_BR, FALCON_NINV_MONT_BR
+ };
+
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_br[] = {
+ PADDING, -1479, -5146, 4043, PADDING, PADDING, PADDING, PADDING,
+ -1305, 3542, -3504, -4821, 2639, -2625, -949, 2319,
+ -1170, -955, -790, -3201, 3014, 5086, -1326, PADDING,
+ 1260, 1260, 1260, 1260, 4632, 4632, 4632, 4632,
+ 2426, 2426, 2426, 2426, 1428, 1428, 1428, 1428,
+ 2013, 2013, 2013, 2013, 729, 729, 729, 729,
+ 2881, 2881, 2881, 2881, -5092, -5092, -5092, -5092,
+ 4388, 4388, 4388, 4388, -5755, -5755, -5755, -5755,
+ 334, 334, 334, 334, 1696, 1696, 1696, 1696,
+ -3289, -3289, -3289, -3289, 3241, 3241, 3241, 3241,
+ 3284, 3284, 3284, 3284, -2089, -2089, -2089, -2089,
+ 2401, 442, -5101, -1067, 390, 773, -3833, 3778,
+ 354, 4861, -2912, 5698, 5012, -2481, 2859, -1045,
+ 1017, -4885, 1632, -5084, 27, -3066, -3763, -1440,
+ 1537, 242, 4714, -4143, -2678, 3704, 5019, -545,
+ 49, 5915, -2500, -1583, 1512, -1815, 5369, -3202,
+ -2738, -5735, -3009, 174, -1975, 347, -3315, 1858,
+ 3030, 2361, 2908, 3434, 3963, 6142, 1954, -2882,
+ 3991, -2767, 2281, -2031, 3772, 5908, 5429, -4737,
+ 1263, 1483, -1489, -5942, 350, 5383, -2057, 4493,
+ -5868, 2655, 1693, 723, -3757, 2925, -426, 4754,
+ 4115, -1843, 218, -3529, 576, -2447, -2051, -1805,
+ -3969, 156, 5876, 5333, 418, -453, -4774, 1293,
+ 722, -2545, 3621, -563, -2975, -3006, -2744, 4846,
+ -2747, -3135, 3712, 4805, -3553, -1062, -2294, PADDING,
+ -3694, -3694, -3694, -3694, -1759, -1759, -1759, -1759,
+ 3382, 3382, 3382, 3382, -2548, -2548, -2548, -2548,
+ 3637, 3637, 3637, 3637, 145, 145, 145, 145,
+ -2731, -2731, -2731, -2731, -4890, -4890, -4890, -4890,
+ -5179, -5179, -5179, -5179, -3707, -3707, -3707, -3707,
+ -355, -355, -355, -355, -4231, -4231, -4231, -4231,
+ 3459, 3459, 3459, 3459, -5542, -5542, -5542, -5542,
+ -3932, -3932, -3932, -3932, -5911, -5911, -5911, -5911,
+ 1002, 5011, 5088, -4284, -4976, -1607, -3780, -875,
+ -2437, 3646, 6022, 2987, -2566, -2187, -6039, -2422,
+ -1065, 2143, -404, -4645, 1168, 5277, -1207, 3248,
+ 493, -4096, -5444, 2381, -4337, -435, 1378, 1912,
+ 295, 5766, -4016, -3762, 325, -1146, 5990, -3728,
+ 3329, -168, 5961, -1962, -6122, -5184, 1360, -6119,
+ -4079, 922, 1958, 1112, 4046, -3150, 4240, -6065,
+ 2459, 3656, -1566, -2948, -3123, -3054, -4433, 3834,
+ 6099, 652, 4077, -2919, -1404, -948, 1159, -4049,
+ 4298, 2692, -5106, 1594, -2555, -1200, 3956, 5297,
+ -1058, 441, 4322, 2078, 709, 1319, -3570, -835,
+ 683, -64, 5782, -2503, -1747, -5486, -5919, -5257,
+ 5736, -1646, 1212, 5728, -4591, 5023, 5828, 3091,
+ -81, -4320, -1000, -2963, -4896, -3051, 2366, PADDING,
+ -2842, -2842, -2842, -2842, 1022, 1022, 1022, 1022,
+ -2468, -2468, -2468, -2468, 5791, 5791, 5791, 5791,
+ -1673, -1673, -1673, -1673, -5331, -5331, -5331, -5331,
+ -4177, -4177, -4177, -4177, 1381, 1381, 1381, 1381,
+ 480, 480, 480, 480, 9, 9, 9, 9,
+ 339, 339, 339, 339, 544, 544, 544, 544,
+ 4278, 4278, 4278, 4278, -4989, -4989, -4989, -4989,
+ -3584, -3584, -3584, -3584, -2525, -2525, -2525, -2525,
+ 2166, 3915, -113, -4919, -160, 3149, -3, 4437,
+ 3636, 4938, 5291, 2704, -1426, -4654, 1663, -1777,
+ 3364, 1689, 4057, -3271, -2847, -4414, 2174, 4372,
+ -5042, -2305, 4053, 2645, 5195, -2780, -4895, 1484,
+ -5241, -4169, -5468, -3482, 5057, 4780, -192, 4912,
+ 677, -6055, 1323, -52, 1579, -2505, 3957, 151,
+ -58, 3532, 1956, -885, 3477, 142, -2844, -975,
+ -3029, 4782, -4213, 2302, -421, 3602, -3600, 6077,
+ -2920, -3127, 1010, 787, 4698, -3445, 1321, -2049,
+ -5874, -3336, -2766, 3174, -431, 5906, -2839, -2127,
+ -241, -1003, -5009, -6008, -5681, -1105, 3438, 4212,
+ -5594, 5886, 504, -605, -4080, 6068, 3263, -4624,
+ -4134, 3195, 5860, -3328, -5777, -4978, 1351, -1177,
+ -4255, -1635, -2768, -140, -1853, -4611, -726, PADDING,
+ -953, -953, -953, -953, 827, 827, 827, 827,
+ 2476, 2476, 2476, 2476, 2197, 2197, 2197, 2197,
+ 3949, 3949, 3949, 3949, 4452, 4452, 4452, 4452,
+ -4354, -4354, -4354, -4354, 2837, 2837, 2837, 2837,
+ -3748, -3748, -3748, -3748, 5767, 5767, 5767, 5767,
+ 118, 118, 118, 118, -5067, -5067, -5067, -5067,
+ -3296, -3296, -3296, -3296, 2396, 2396, 2396, 2396,
+ 130, 130, 130, 130, -5374, -5374, -5374, -5374,
+ -3247, -2686, -3978, -2969, -2370, 2865, 5332, 3510,
+ 1630, -2126, 5407, 3186, -1153, -2884, -2249, -4048,
+ -2399, -3400, -5191, -3136, -3000, 671, 3016, 243,
+ -5559, 420, -2178, 1544, 3985, 4905, 3531, 476,
+ -4467, -5537, 4449, -147, 6118, 1190, 3860, -4536,
+ 5079, 2169, -4324, -4075, -1278, 1973, -3514, 5925,
+ 654, 1702, -5529, 3199, 6136, -5415, 4948, 400,
+ 5339, 3710, 468, 316, -2033, 3879, -1359, 973,
+ -4789, 4749, -5456, -3789, -3818, -2683, 5445, -1050,
+ -3262, -522, 4916, 5315, -2344, -5574, -1041, -1018,
+ 3565, 1987, 5206, -56, -5862, -3643, -6137, -1728,
+ 5446, 6093, -3988, -382, -3998, 1922, -5435, -1254,
+}; // 512->712
+
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_qinv_br[] = {
+ PADDING, -3943, -13721, 10780, PADDING, PADDING, PADDING, PADDING,
+ -3479, 9444, -9343, -12854, 7036, -6999, -2530, 6183,
+ -3119, -2546, -2106, -8535, 8036, 13561, -3535, PADDING,
+ 3359, 3359, 3359, 3359, 12350, 12350, 12350, 12350,
+ 6468, 6468, 6468, 6468, 3807, 3807, 3807, 3807,
+ 5367, 5367, 5367, 5367, 1943, 1943, 1943, 1943,
+ 7682, 7682, 7682, 7682, -13577, -13577, -13577, -13577,
+ 11700, 11700, 11700, 11700, -15345, -15345, -15345, -15345,
+ 890, 890, 890, 890, 4522, 4522, 4522, 4522,
+ -8769, -8769, -8769, -8769, 8641, 8641, 8641, 8641,
+ 8756, 8756, 8756, 8756, -5570, -5570, -5570, -5570,
+ 6402, 1178, -13601, -2845, 1039, 2061, -10220, 10073,
+ 943, 12961, -7764, 15193, 13364, -6615, 7623, -2786,
+ 2711, -13025, 4351, -13556, 71, -8175, -10033, -3839,
+ 4098, 645, 12569, -11047, -7140, 9876, 13382, -1453,
+ 130, 15772, -6666, -4220, 4031, -4839, 14316, -8537,
+ -7300, -15292, -8023, 463, -5266, 925, -8839, 4954,
+ 8079, 6295, 7754, 9156, 10567, 16377, 5210, -7684,
+ 10641, -7378, 6082, -5415, 10057, 15753, 14476, -12630,
+ 3367, 3954, -3970, -15844, 933, 14353, -5484, 11980,
+ -15646, 7079, 4514, 1927, -10017, 7799, -1135, 12676,
+ 10972, -4914, 581, -9409, 1535, -6524, -5468, -4812,
+ -10583, 415, 15668, 14220, 1114, -1207, -12729, 3447,
+ 1925, -6786, 9655, -1501, -7932, -8015, -7316, 12921,
+ -7324, -8359, 9897, 12812, -9473, -2831, -6116, PADDING,
+ -9849, -9849, -9849, -9849, -4690, -4690, -4690, -4690,
+ 9017, 9017, 9017, 9017, -6794, -6794, -6794, -6794,
+ 9697, 9697, 9697, 9697, 386, 386, 386, 386,
+ -7282, -7282, -7282, -7282, -13038, -13038, -13038, -13038,
+ -13809, -13809, -13809, -13809, -9884, -9884, -9884, -9884,
+ -946, -946, -946, -946, -11281, -11281, -11281, -11281,
+ 9223, 9223, 9223, 9223, -14777, -14777, -14777, -14777,
+ -10484, -10484, -10484, -10484, -15761, -15761, -15761, -15761,
+ 2671, 13361, 13566, -11423, -13268, -4284, -10079, -2333,
+ -6498, 9721, 16057, 7964, -6842, -5831, -16102, -6458,
+ -2839, 5714, -1077, -12385, 3114, 14070, -3218, 8660,
+ 1314, -10921, -14516, 6348, -11564, -1159, 3674, 5098,
+ 786, 15374, -10708, -10031, 866, -3055, 15972, -9940,
+ 8876, -447, 15894, -5231, -16324, -13822, 3626, -16316,
+ -10876, 2458, 5220, 2965, 10788, -8399, 11305, -16172,
+ 6556, 9748, -4175, -7860, -8327, -8143, -11820, 10223,
+ 16262, 1738, 10871, -7783, -3743, -2527, 3090, -10796,
+ 11460, 7178, -13614, 4250, -6812, -3199, 10548, 14124,
+ -2821, 1175, 11524, 5540, 1890, 3517, -9519, -2226,
+ 1821, -170, 15417, -6674, -4658, -14628, -15782, -14017,
+ 15294, -4388, 3231, 15273, -12241, 13393, 15540, 8241,
+ -215, -11519, -2666, -7900, -13054, -8135, 6308, PADDING,
+ -7578, -7578, -7578, -7578, 2725, 2725, 2725, 2725,
+ -6580, -6580, -6580, -6580, 15441, 15441, 15441, 15441,
+ -4460, -4460, -4460, -4460, -14214, -14214, -14214, -14214,
+ -11137, -11137, -11137, -11137, 3682, 3682, 3682, 3682,
+ 1279, 1279, 1279, 1279, 23, 23, 23, 23,
+ 903, 903, 903, 903, 1450, 1450, 1450, 1450,
+ 11407, 11407, 11407, 11407, -13302, -13302, -13302, -13302,
+ -9556, -9556, -9556, -9556, -6732, -6732, -6732, -6732,
+ 5775, 10439, -301, -13116, -426, 8396, -7, 11831,
+ 9695, 13166, 14108, 7210, -3802, -12409, 4434, -4738,
+ 8969, 4503, 10817, -8721, -7591, -11769, 5796, 11657,
+ -13444, -6146, 10807, 7052, 13852, -7412, -13052, 3957,
+ -13974, -11116, -14580, -9284, 13484, 12745, -511, 13097,
+ 1805, -16145, 3527, -138, 4210, -6679, 10551, 402,
+ -154, 9417, 5215, -2359, 9271, 378, -7583, -2599,
+ -8076, 12750, -11233, 6138, -1122, 9604, -9599, 16204,
+ -7786, -8337, 2693, 2098, 12526, -9185, 3522, -5463,
+ -15662, -8895, -7375, 8463, -1149, 15748, -7570, -5671,
+ -642, -2674, -13356, -16020, -15148, -2946, 9167, 11231,
+ -14916, 15694, 1343, -1613, -10879, 16180, 8700, -12329,
+ -11023, 8519, 15625, -8873, -15404, -13273, 3602, -3138,
+ -11345, -4359, -7380, -373, -4940, -12294, -1935, PADDING,
+ -2541, -2541, -2541, -2541, 2205, 2205, 2205, 2205,
+ 6602, 6602, 6602, 6602, 5858, 5858, 5858, 5858,
+ 10529, 10529, 10529, 10529, 11871, 11871, 11871, 11871,
+ -11609, -11609, -11609, -11609, 7564, 7564, 7564, 7564,
+ -9993, -9993, -9993, -9993, 15377, 15377, 15377, 15377,
+ 314, 314, 314, 314, -13510, -13510, -13510, -13510,
+ -8788, -8788, -8788, -8788, 6388, 6388, 6388, 6388,
+ 346, 346, 346, 346, -14329, -14329, -14329, -14329,
+ -8657, -7162, -10607, -7916, -6319, 7639, 14217, 9359,
+ 4346, -5668, 14417, 8495, -3074, -7690, -5996, -10793,
+ -6396, -9065, -13841, -8361, -7999, 1789, 8042, 647,
+ -14822, 1119, -5807, 4116, 10625, 13078, 9415, 1269,
+ -11911, -14764, 11863, -391, 16313, 3173, 10292, -12095,
+ 13542, 5783, -11529, -10865, -3407, 5260, -9369, 15798,
+ 1743, 4538, -14742, 8529, 16361, -14438, 13193, 1066,
+ 14236, 9892, 1247, 842, -5420, 10343, -3623, 2594,
+ -12769, 12662, -14548, -10103, -10180, -7154, 14518, -2799,
+ -8697, -1391, 13108, 14172, -6250, -14862, -2775, -2714,
+ 9505, 5298, 13881, -149, -15630, -9713, -16364, -4607,
+ 14521, 16246, -10633, -1018, -10660, 5124, -14492, -3343,
+}; // 712
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_br[] = {
+ 1254, 5435, -1922, 3998, 382, 3988, -6093, -5446,
+ 1728, 6137, 3643, 5862, 56, -5206, -1987, -3565,
+ 1018, 1041, 5574, 2344, -5315, -4916, 522, 3262,
+ 1050, -5445, 2683, 3818, 3789, 5456, -4749, 4789,
+ -973, 1359, -3879, 2033, -316, -468, -3710, -5339,
+ -400, -4948, 5415, -6136, -3199, 5529, -1702, -654,
+ -5925, 3514, -1973, 1278, 4075, 4324, -2169, -5079,
+ 4536, -3860, -1190, -6118, 147, -4449, 5537, 4467,
+ -476, -3531, -4905, -3985, -1544, 2178, -420, 5559,
+ -243, -3016, -671, 3000, 3136, 5191, 3400, 2399,
+ 4048, 2249, 2884, 1153, -3186, -5407, 2126, -1630,
+ -3510, -5332, -2865, 2370, 2969, 3978, 2686, 3247,
+ 5374, 5374, 5374, 5374, -130, -130, -130, -130,
+ -2396, -2396, -2396, -2396, 3296, 3296, 3296, 3296,
+ 5067, 5067, 5067, 5067, -118, -118, -118, -118,
+ -5767, -5767, -5767, -5767, 3748, 3748, 3748, 3748,
+ -2837, -2837, -2837, -2837, 4354, 4354, 4354, 4354,
+ -4452, -4452, -4452, -4452, -3949, -3949, -3949, -3949,
+ -2197, -2197, -2197, -2197, -2476, -2476, -2476, -2476,
+ -827, -827, -827, -827, 953, 953, 953, 953,
+ 726, 4611, 1853, 140, 2768, 1635, 4255, 1177,
+ -1351, 4978, 5777, 3328, -5860, -3195, 4134, PADDING,
+ 4624, -3263, -6068, 4080, 605, -504, -5886, 5594,
+ -4212, -3438, 1105, 5681, 6008, 5009, 1003, 241,
+ 2127, 2839, -5906, 431, -3174, 2766, 3336, 5874,
+ 2049, -1321, 3445, -4698, -787, -1010, 3127, 2920,
+ -6077, 3600, -3602, 421, -2302, 4213, -4782, 3029,
+ 975, 2844, -142, -3477, 885, -1956, -3532, 58,
+ -151, -3957, 2505, -1579, 52, -1323, 6055, -677,
+ -4912, 192, -4780, -5057, 3482, 5468, 4169, 5241,
+ -1484, 4895, 2780, -5195, -2645, -4053, 2305, 5042,
+ -4372, -2174, 4414, 2847, 3271, -4057, -1689, -3364,
+ 1777, -1663, 4654, 1426, -2704, -5291, -4938, -3636,
+ -4437, 3, -3149, 160, 4919, 113, -3915, -2166,
+ 2525, 2525, 2525, 2525, 3584, 3584, 3584, 3584,
+ 4989, 4989, 4989, 4989, -4278, -4278, -4278, -4278,
+ -544, -544, -544, -544, -339, -339, -339, -339,
+ -9, -9, -9, -9, -480, -480, -480, -480,
+ -1381, -1381, -1381, -1381, 4177, 4177, 4177, 4177,
+ 5331, 5331, 5331, 5331, 1673, 1673, 1673, 1673,
+ -5791, -5791, -5791, -5791, 2468, 2468, 2468, 2468,
+ -1022, -1022, -1022, -1022, 2842, 2842, 2842, 2842,
+ -2366, 3051, 4896, 2963, 1000, 4320, 81, -3091,
+ -5828, -5023, 4591, -5728, -1212, 1646, -5736, PADDING,
+ 5257, 5919, 5486, 1747, 2503, -5782, 64, -683,
+ 835, 3570, -1319, -709, -2078, -4322, -441, 1058,
+ -5297, -3956, 1200, 2555, -1594, 5106, -2692, -4298,
+ 4049, -1159, 948, 1404, 2919, -4077, -652, -6099,
+ -3834, 4433, 3054, 3123, 2948, 1566, -3656, -2459,
+ 6065, -4240, 3150, -4046, -1112, -1958, -922, 4079,
+ 6119, -1360, 5184, 6122, 1962, -5961, 168, -3329,
+ 3728, -5990, 1146, -325, 3762, 4016, -5766, -295,
+ -1912, -1378, 435, 4337, -2381, 5444, 4096, -493,
+ -3248, 1207, -5277, -1168, 4645, 404, -2143, 1065,
+ 2422, 6039, 2187, 2566, -2987, -6022, -3646, 2437,
+ 875, 3780, 1607, 4976, 4284, -5088, -5011, -1002,
+ 5911, 5911, 5911, 5911, 3932, 3932, 3932, 3932,
+ 5542, 5542, 5542, 5542, -3459, -3459, -3459, -3459,
+ 4231, 4231, 4231, 4231, 355, 355, 355, 355,
+ 3707, 3707, 3707, 3707, 5179, 5179, 5179, 5179,
+ 4890, 4890, 4890, 4890, 2731, 2731, 2731, 2731,
+ -145, -145, -145, -145, -3637, -3637, -3637, -3637,
+ 2548, 2548, 2548, 2548, -3382, -3382, -3382, -3382,
+ 1759, 1759, 1759, 1759, 3694, 3694, 3694, 3694,
+ 2294, 1062, 3553, -4805, -3712, 3135, 2747, -4846,
+ 2744, 3006, 2975, 563, -3621, 2545, -722, PADDING,
+ -1293, 4774, 453, -418, -5333, -5876, -156, 3969,
+ 1805, 2051, 2447, -576, 3529, -218, 1843, -4115,
+ -4754, 426, -2925, 3757, -723, -1693, -2655, 5868,
+ -4493, 2057, -5383, -350, 5942, 1489, -1483, -1263,
+ 4737, -5429, -5908, -3772, 2031, -2281, 2767, -3991,
+ 2882, -1954, -6142, -3963, -3434, -2908, -2361, -3030,
+ -1858, 3315, -347, 1975, -174, 3009, 5735, 2738,
+ 3202, -5369, 1815, -1512, 1583, 2500, -5915, -49,
+ 545, -5019, -3704, 2678, 4143, -4714, -242, -1537,
+ 1440, 3763, 3066, -27, 5084, -1632, 4885, -1017,
+ 1045, -2859, 2481, -5012, -5698, 2912, -4861, -354,
+ -3778, 3833, -773, -390, 1067, 5101, -442, -2401,
+ 2089, 2089, 2089, 2089, -3284, -3284, -3284, -3284,
+ -3241, -3241, -3241, -3241, 3289, 3289, 3289, 3289,
+ -1696, -1696, -1696, -1696, -334, -334, -334, -334,
+ 5755, 5755, 5755, 5755, -4388, -4388, -4388, -4388,
+ 5092, 5092, 5092, 5092, -2881, -2881, -2881, -2881,
+ -729, -729, -729, -729, -2013, -2013, -2013, -2013,
+ -1428, -1428, -1428, -1428, -2426, -2426, -2426, -2426,
+ -4632, -4632, -4632, -4632, -1260, -1260, -1260, -1260,
+ 1326, -5086, -3014, 3201, 790, 955, 1170, -2319,
+ 949, 2625, -2639, 4821, 3504, -3542, 1305, PADDING,
+ -4043, 5146, 1371, 12265, 1479, PADDING, PADDING, PADDING,
+}; // 712
+
+const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_qinv_br[] = {
+ 3343, 14492, -5124, 10660, 1018, 10633, -16246, -14521,
+ 4607, 16364, 9713, 15630, 149, -13881, -5298, -9505,
+ 2714, 2775, 14862, 6250, -14172, -13108, 1391, 8697,
+ 2799, -14518, 7154, 10180, 10103, 14548, -12662, 12769,
+ -2594, 3623, -10343, 5420, -842, -1247, -9892, -14236,
+ -1066, -13193, 14438, -16361, -8529, 14742, -4538, -1743,
+ -15798, 9369, -5260, 3407, 10865, 11529, -5783, -13542,
+ 12095, -10292, -3173, -16313, 391, -11863, 14764, 11911,
+ -1269, -9415, -13078, -10625, -4116, 5807, -1119, 14822,
+ -647, -8042, -1789, 7999, 8361, 13841, 9065, 6396,
+ 10793, 5996, 7690, 3074, -8495, -14417, 5668, -4346,
+ -9359, -14217, -7639, 6319, 7916, 10607, 7162, 8657,
+ 14329, 14329, 14329, 14329, -346, -346, -346, -346,
+ -6388, -6388, -6388, -6388, 8788, 8788, 8788, 8788,
+ 13510, 13510, 13510, 13510, -314, -314, -314, -314,
+ -15377, -15377, -15377, -15377, 9993, 9993, 9993, 9993,
+ -7564, -7564, -7564, -7564, 11609, 11609, 11609, 11609,
+ -11871, -11871, -11871, -11871, -10529, -10529, -10529, -10529,
+ -5858, -5858, -5858, -5858, -6602, -6602, -6602, -6602,
+ -2205, -2205, -2205, -2205, 2541, 2541, 2541, 2541,
+ 1935, 12294, 4940, 373, 7380, 4359, 11345, 3138,
+ -3602, 13273, 15404, 8873, -15625, -8519, 11023, PADDING,
+ 12329, -8700, -16180, 10879, 1613, -1343, -15694, 14916,
+ -11231, -9167, 2946, 15148, 16020, 13356, 2674, 642,
+ 5671, 7570, -15748, 1149, -8463, 7375, 8895, 15662,
+ 5463, -3522, 9185, -12526, -2098, -2693, 8337, 7786,
+ -16204, 9599, -9604, 1122, -6138, 11233, -12750, 8076,
+ 2599, 7583, -378, -9271, 2359, -5215, -9417, 154,
+ -402, -10551, 6679, -4210, 138, -3527, 16145, -1805,
+ -13097, 511, -12745, -13484, 9284, 14580, 11116, 13974,
+ -3957, 13052, 7412, -13852, -7052, -10807, 6146, 13444,
+ -11657, -5796, 11769, 7591, 8721, -10817, -4503, -8969,
+ 4738, -4434, 12409, 3802, -7210, -14108, -13166, -9695,
+ -11831, 7, -8396, 426, 13116, 301, -10439, -5775,
+ 6732, 6732, 6732, 6732, 9556, 9556, 9556, 9556,
+ 13302, 13302, 13302, 13302, -11407, -11407, -11407, -11407,
+ -1450, -1450, -1450, -1450, -903, -903, -903, -903,
+ -23, -23, -23, -23, -1279, -1279, -1279, -1279,
+ -3682, -3682, -3682, -3682, 11137, 11137, 11137, 11137,
+ 14214, 14214, 14214, 14214, 4460, 4460, 4460, 4460,
+ -15441, -15441, -15441, -15441, 6580, 6580, 6580, 6580,
+ -2725, -2725, -2725, -2725, 7578, 7578, 7578, 7578,
+ -6308, 8135, 13054, 7900, 2666, 11519, 215, -8241,
+ -15540, -13393, 12241, -15273, -3231, 4388, -15294, PADDING,
+ 14017, 15782, 14628, 4658, 6674, -15417, 170, -1821,
+ 2226, 9519, -3517, -1890, -5540, -11524, -1175, 2821,
+ -14124, -10548, 3199, 6812, -4250, 13614, -7178, -11460,
+ 10796, -3090, 2527, 3743, 7783, -10871, -1738, -16262,
+ -10223, 11820, 8143, 8327, 7860, 4175, -9748, -6556,
+ 16172, -11305, 8399, -10788, -2965, -5220, -2458, 10876,
+ 16316, -3626, 13822, 16324, 5231, -15894, 447, -8876,
+ 9940, -15972, 3055, -866, 10031, 10708, -15374, -786,
+ -5098, -3674, 1159, 11564, -6348, 14516, 10921, -1314,
+ -8660, 3218, -14070, -3114, 12385, 1077, -5714, 2839,
+ 6458, 16102, 5831, 6842, -7964, -16057, -9721, 6498,
+ 2333, 10079, 4284, 13268, 11423, -13566, -13361, -2671,
+ 15761, 15761, 15761, 15761, 10484, 10484, 10484, 10484,
+ 14777, 14777, 14777, 14777, -9223, -9223, -9223, -9223,
+ 11281, 11281, 11281, 11281, 946, 946, 946, 946,
+ 9884, 9884, 9884, 9884, 13809, 13809, 13809, 13809,
+ 13038, 13038, 13038, 13038, 7282, 7282, 7282, 7282,
+ -386, -386, -386, -386, -9697, -9697, -9697, -9697,
+ 6794, 6794, 6794, 6794, -9017, -9017, -9017, -9017,
+ 4690, 4690, 4690, 4690, 9849, 9849, 9849, 9849,
+ 6116, 2831, 9473, -12812, -9897, 8359, 7324, -12921,
+ 7316, 8015, 7932, 1501, -9655, 6786, -1925, PADDING,
+ -3447, 12729, 1207, -1114, -14220, -15668, -415, 10583,
+ 4812, 5468, 6524, -1535, 9409, -581, 4914, -10972,
+ -12676, 1135, -7799, 10017, -1927, -4514, -7079, 15646,
+ -11980, 5484, -14353, -933, 15844, 3970, -3954, -3367,
+ 12630, -14476, -15753, -10057, 5415, -6082, 7378, -10641,
+ 7684, -5210, -16377, -10567, -9156, -7754, -6295, -8079,
+ -4954, 8839, -925, 5266, -463, 8023, 15292, 7300,
+ 8537, -14316, 4839, -4031, 4220, 6666, -15772, -130,
+ 1453, -13382, -9876, 7140, 11047, -12569, -645, -4098,
+ 3839, 10033, 8175, -71, 13556, -4351, 13025, -2711,
+ 2786, -7623, 6615, -13364, -15193, 7764, -12961, -943,
+ -10073, 10220, -2061, -1039, 2845, 13601, -1178, -6402,
+ 5570, 5570, 5570, 5570, -8756, -8756, -8756, -8756,
+ -8641, -8641, -8641, -8641, 8769, 8769, 8769, 8769,
+ -4522, -4522, -4522, -4522, -890, -890, -890, -890,
+ 15345, 15345, 15345, 15345, -11700, -11700, -11700, -11700,
+ 13577, 13577, 13577, 13577, -7682, -7682, -7682, -7682,
+ -1943, -1943, -1943, -1943, -5367, -5367, -5367, -5367,
+ -3807, -3807, -3807, -3807, -6468, -6468, -6468, -6468,
+ -12350, -12350, -12350, -12350, -3359, -3359, -3359, -3359,
+ 3535, -13561, -8036, 8535, 2106, 2546, 3119, -6183,
+ 2530, 6999, -7036, 12854, 9343, -9444, 3479, PADDING,
+ -10780, 13721, 3655, 32704, 3943, PADDING, PADDING, PADDING,
+}; // 712
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.h
new file mode 100644
index 000000000..ded719645
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/ntt_consts.h
@@ -0,0 +1,23 @@
+#ifndef NTT_CONSTS
+#define NTT_CONSTS
+
+#include <stdint.h>
+
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_qmvq[8];
+
+/*
+ * Table for NTT, binary case:
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ */
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_ntt_qinv_br[];
+
+/*
+ * Table for inverse NTT
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
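+
+/*
+ * Quick worked check (not from the upstream comment): 7 * 8778 = 61446 =
+ * 5 * 12289 + 1, so 8778 is indeed g^-1 mod q.
+ */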
+
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_br[];
+extern const int16_t PQCLEAN_FALCONPADDED512_AARCH64_invntt_qinv_br[];
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/params.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/params.h
new file mode 100644
index 000000000..b02384ae9
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/params.h
@@ -0,0 +1,17 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define FALCON_LOGN 9
+
+#define FALCON_N (1 << FALCON_LOGN)
+#define FALCON_Q 12289
+#define FALCON_QINV (-12287) // pow(12289, -1, pow(2, 16)) - pow(2, 16)
+#define FALCON_V 5461 // Barrett reduction
+#define FALCON_MONT 4091 // pow(2, 16, 12289)
+#define FALCON_MONT_BR 10908 // (4091 << 16)//q//2
+
+#define FALCON_NINV_MONT 128 // pow(512, -1, 12289) * pow(2, 16, 12289)
+#define FALCON_NINV_MONT_BR 341 // (128 << 16) //q // 2
+#define FALCON_LOG2_NINV_MONT 7
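+
+/*
+ * Worked check of the Montgomery constants above (illustrative):
+ * 2^16 = 65536 = 5 * 12289 + 4091, hence FALCON_MONT = 4091; and
+ * 128 * 512 = 65536, i.e. 4091 mod q, hence FALCON_NINV_MONT = 128 is
+ * N^-1 * 2^16 mod q for N = 512.
+ */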
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly.h
new file mode 100644
index 000000000..73836b3f8
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly.h
@@ -0,0 +1,42 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "inner.h"
+#include "params.h"
+
+typedef enum ntt_domain {
+ NTT_NONE = 0,
+ NTT_MONT = 1,
+ NTT_MONT_INV = 2,
+} ntt_domain_t;
+
+typedef enum invntt_domain {
+ INVNTT_NONE = 0,
+ INVNTT_NINV = 1,
+} invntt_domain_t;
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(int16_t a[FALCON_N], ntt_domain_t mont);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(int16_t a[FALCON_N], invntt_domain_t ninv);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]);
+
+uint16_t PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(int16_t f[FALCON_N], const int16_t g[FALCON_N]);
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high);
+
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_float.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_float.c
new file mode 100644
index 000000000..b3eb7598d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_float.c
@@ -0,0 +1,1459 @@
+/*
+ * Poly FFT
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen ,
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ // n = 2;
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfadd(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ // n = 4
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfadd(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfadd(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfaddx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a - b
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(fpr *c, const fpr *restrict a,
+ const fpr *restrict b, unsigned logn) {
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t neon_a2, neon_b2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+ vload(neon_b.val[0], &b[0]);
+
+ vfsub(neon_c.val[0], neon_a.val[0], neon_b.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+ vloadx2(neon_b2, &b[0]);
+
+ vfsub(neon_c2.val[0], neon_a2.val[0], neon_b2.val[0]);
+ vfsub(neon_c2.val[1], neon_a2.val[1], neon_b2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+ vloadx4(neon_b, &b[i]);
+
+ vfsubx4(neon_c, neon_a, neon_b);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+/*
+ * c = -a
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+
+ switch (logn) {
+ case 1:
+ vload(neon_a.val[0], &a[0]);
+
+ vfneg(neon_c.val[0], neon_a.val[0]);
+
+ vstore(&c[0], neon_c.val[0]);
+ break;
+
+ case 2:
+ vloadx2(neon_a2, &a[0]);
+
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+
+ vstorex2(&c[0], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_adj_fft(fpr *c, const fpr *restrict a,
+ unsigned logn) {
+
+ float64x2x4_t neon_a, neon_c;
+ float64x2x2_t neon_a2, neon_c2;
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1;
+ c[1] = fpr_neg(a[1]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2
+ vload(neon_a.val[0], &a[2]);
+ vfneg(neon_c.val[0], neon_a.val[0]);
+ vstore(&c[2], neon_c.val[0]);
+ break;
+
+ case 3:
+ // n = 8; hn = 4
+ vloadx2(neon_a2, &a[4]);
+ vfneg(neon_c2.val[0], neon_a2.val[0]);
+ vfneg(neon_c2.val[1], neon_a2.val[1]);
+ vstorex2(&c[4], neon_c2);
+ break;
+
+ default:
+ for (unsigned i = hn; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfnegx4(neon_c, neon_a);
+
+ vstorex4(&c[i], neon_c);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log1(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re;
+ c[1] = c_im;
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log2(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_c;
+ float64x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+
+ FPC_MUL(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re;
+ neon_c.val[1] = c_im;
+
+ vstorex2(&c[0], neon_c);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log3(
+ fpr *restrict c, const fpr *restrict a, const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_c;
+ float64x2x2_t a_re, a_im, b_re, b_im, c_re, c_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ FPC_MULx2(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ neon_c.val[0] = c_re.val[0];
+ neon_c.val[1] = c_re.val[1];
+ neon_c.val[2] = c_im.val[0];
+ neon_c.val[3] = c_im.val[1];
+
+ vstorex4(&c[0], neon_c);
+}
+
+/* see inner.h */
+/*
+ * c = a * b
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im; // 24
+ float64x2x4_t c_re, c_im; // 8
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log1(c, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log2(c, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_log3(c, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MULx4(c_re, c_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
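+
+/*
+ * Illustrative aside (not part of the upstream implementation): coefficients
+ * are stored split-complex, real parts in x[0..hn-1] and imaginary parts in
+ * x[hn..n-1], and FPC_MULx4 above computes the usual complex product eight
+ * lanes at a time. A minimal scalar sketch of the same pointwise product,
+ * kept under "#if 0" so it is never compiled:
+ */
+#if 0
+static void poly_mul_fft_ref(fpr *c, const fpr *a, const fpr *b, unsigned logn) {
+ const unsigned hn = 1u << (logn - 1);
+ for (unsigned i = 0; i < hn; i++) {
+ fpr a_re = a[i], a_im = a[i + hn];
+ fpr b_re = b[i], b_im = b[i + hn];
+ c[i] = a_re * b_re - a_im * b_im;
+ c[i + hn] = a_re * b_im + a_im * b_re;
+ }
+}
+#endif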
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log1(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ fpr a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im;
+
+ a_re = a[0];
+ a_im = a[1];
+ b_re = b[0];
+ b_im = b[1];
+ d_re = d[0];
+ d_im = d[1];
+
+ c_re = a_re * b_re - a_im * b_im;
+ c_im = a_re * b_im + a_im * b_re;
+
+ c[0] = c_re + d_re;
+ c[1] = c_im + d_im;
+
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log2(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 4
+ float64x2x2_t neon_a, neon_b, neon_d;
+ float64x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ // 0: re, re
+ // 1: im, im
+ vloadx2(neon_a, &a[0]);
+ vloadx2(neon_b, &b[0]);
+ vloadx2(neon_d, &d[0]);
+
+ a_re = neon_a.val[0];
+ a_im = neon_a.val[1];
+ b_re = neon_b.val[0];
+ b_im = neon_b.val[1];
+ d_re = neon_d.val[0];
+ d_im = neon_d.val[1];
+
+ FPC_MLA(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re;
+ neon_d.val[1] = d_im;
+
+ vstorex2(&c[0], neon_d);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log3(
+ fpr *restrict c, const fpr *restrict d, const fpr *restrict a,
+ const fpr *restrict b) {
+ // n = 8
+ float64x2x4_t neon_a, neon_b, neon_d;
+ float64x2x2_t a_re, a_im, b_re, b_im, d_re, d_im;
+
+ vloadx4(neon_a, &a[0]);
+ vloadx4(neon_b, &b[0]);
+ vloadx4(neon_d, &d[0]);
+
+ a_re.val[0] = neon_a.val[0];
+ a_re.val[1] = neon_a.val[1];
+ a_im.val[0] = neon_a.val[2];
+ a_im.val[1] = neon_a.val[3];
+
+ b_re.val[0] = neon_b.val[0];
+ b_re.val[1] = neon_b.val[1];
+ b_im.val[0] = neon_b.val[2];
+ b_im.val[1] = neon_b.val[3];
+
+ d_re.val[0] = neon_d.val[0];
+ d_re.val[1] = neon_d.val[1];
+ d_im.val[0] = neon_d.val[2];
+ d_im.val[1] = neon_d.val[3];
+
+ FPC_MLAx2(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ neon_d.val[0] = d_re.val[0];
+ neon_d.val[1] = d_re.val[1];
+ neon_d.val[2] = d_im.val[0];
+ neon_d.val[3] = d_im.val[1];
+
+ vstorex4(&c[0], neon_d);
+}
+
+/* see inner.h */
+/*
+ * c = d + a * b
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(fpr *c, const fpr *restrict d,
+ const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ // Total 32 registers
+ float64x2x4_t a_re, b_re, a_im, b_im, d_re, d_im; // 32
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log1(c, d, a, b);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log2(c, d, a, b);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft_add_log3(c, d, a, b);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLAx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(fpr *d, fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ FPC_MUL_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&d[i], d_re);
+ vstorex4(&d[i + hn], d_im);
+ }
+}
+
+// c = d + a*b
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(fpr *c, fpr *d, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ float64x2x4_t a_re, b_re, d_re, a_im, b_im, d_im; // 24
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+ vloadx4(d_re, &d[i]);
+ vloadx4(d_im, &d[i + hn]);
+
+ FPC_MLA_CONJx4(d_re, d_im, a_re, a_im, b_re, b_im);
+
+ vstorex4(&c[i], d_re);
+ vstorex4(&c[i + hn], d_im);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(fpr *c,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, c_re, c_im; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ vfdupx4(c_im, 0);
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
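+
+/*
+ * Illustrative aside (not part of the upstream implementation): per
+ * coefficient the loop above computes a * adj(a) = (a_re + i*a_im) *
+ * (a_re - i*a_im) = a_re^2 + a_im^2, which is purely real; this is why the
+ * imaginary half of c is simply filled with zeros via vfdupx4(c_im, 0).
+ */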
+
+/*
+ * c = d + a * adj(a)
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(fpr *c,
+ const fpr *restrict d,
+ const fpr *restrict a,
+ unsigned logn) {
+
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ float64x2x4_t a_re, a_im, d_re; // 16
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(d_re, &d[i]);
+
+ vfmla(d_re.val[0], d_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(d_re.val[0], d_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(d_re.val[1], d_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(d_re.val[1], d_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(d_re.val[2], d_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(d_re.val[2], d_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(d_re.val[3], d_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(d_re.val[3], d_re.val[3], a_im.val[3], a_im.val[3]);
+
+ vstorex4(&c[i], d_re);
+ }
+}
+
+/* see inner.h */
+/*
+ * c = a * scalar_x
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(fpr *c, const fpr *a, const fpr x,
+ unsigned logn) {
+ // assert(logn >= 3);
+ // Total SIMD registers: 9
+ const unsigned falcon_n = 1 << logn;
+ float64x2x4_t neon_a, neon_c; // 8
+ float64x2_t neon_x; // 1
+ neon_x = vdupq_n_f64(x);
+ for (unsigned i = 0; i < falcon_n; i += 8) {
+ vloadx4(neon_a, &a[i]);
+
+ vfmulx4_i(neon_c, neon_a, neon_x);
+
+ vstorex4(&c[i], neon_c);
+ }
+}
+
+/* see inner.h
+ * Unused in the implementation
+ */
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_fft(fpr *restrict c,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re, c_im, m;
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmulx4(m, b_re, b_re);
+ vfmlax4(m, m, b_im, b_im);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmlax4(c_re, c_re, a_im, b_im);
+
+ vfinvx4(m, m);
+
+ vfmulx4(c_im, a_im, b_re);
+ vfmlsx4(c_im, c_im, a_re, b_im);
+
+ vfmulx4(c_re, c_re, m);
+ vfmulx4(c_im, c_im, m);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_invnorm2_fft(fpr *restrict d,
+ const fpr *restrict a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, b_im, c_re;
+ float64x2x2_t x, y;
+ float64x2_t z;
+
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ /*
+ * x_re = a[0];
+ * x_im = a[1];
+ * y_re = b[0];
+ * y_im = b[1];
+ * d[0] = 1.0/( (x_re*x_re) + (x_im*x_im) + (y_re*y_re) + (y_im*y_im) );
+ */
+ vload(a_re.val[0], &a[0]);
+ vload(b_re.val[0], &b[0]);
+ vfmul(a_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], a_re.val[0], b_re.val[0], b_re.val[0]);
+ d[0] = 1.0 / vaddvq_f64(c_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vloadx2(x, &a[0]);
+ vloadx2(y, &b[0]);
+
+ vfmul(z, x.val[0], x.val[0]);
+ vfmla(z, z, x.val[1], x.val[1]);
+ vfmla(z, z, y.val[0], y.val[0]);
+ vfmla(z, z, y.val[1], y.val[1]);
+ vfinv(z, z);
+
+ vstore(&d[0], z);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vloadx4(a_re, &a[0]);
+ vloadx4(b_re, &b[0]);
+
+ vfmul(x.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(x.val[0], x.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(x.val[0], x.val[0], a_re.val[2], a_re.val[2]);
+ vfmla(x.val[0], x.val[0], b_re.val[2], b_re.val[2]);
+ vfinv(x.val[0], x.val[0]);
+
+ vfmul(x.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(x.val[1], x.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(x.val[1], x.val[1], a_re.val[3], a_re.val[3]);
+ vfmla(x.val[1], x.val[1], b_re.val[3], b_re.val[3]);
+ vfinv(x.val[1], x.val[1]);
+
+ vstorex2(&d[0], x);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+ vloadx4(b_im, &b[i + hn]);
+
+ vfmul(c_re.val[0], a_re.val[0], a_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], a_im.val[0], a_im.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_re.val[0], b_re.val[0]);
+ vfmla(c_re.val[0], c_re.val[0], b_im.val[0], b_im.val[0]);
+ vfinv(c_re.val[0], c_re.val[0]);
+
+ vfmul(c_re.val[1], a_re.val[1], a_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], a_im.val[1], a_im.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_re.val[1], b_re.val[1]);
+ vfmla(c_re.val[1], c_re.val[1], b_im.val[1], b_im.val[1]);
+ vfinv(c_re.val[1], c_re.val[1]);
+
+ vfmul(c_re.val[2], a_re.val[2], a_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], a_im.val[2], a_im.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_re.val[2], b_re.val[2]);
+ vfmla(c_re.val[2], c_re.val[2], b_im.val[2], b_im.val[2]);
+ vfinv(c_re.val[2], c_re.val[2]);
+
+ vfmul(c_re.val[3], a_re.val[3], a_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], a_im.val[3], a_im.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_re.val[3], b_re.val[3]);
+ vfmla(c_re.val[3], c_re.val[3], b_im.val[3], b_im.val[3]);
+ vfinv(c_re.val[3], c_re.val[3]);
+
+ vstorex4(&d[i], c_re);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_add_muladj_fft(
+ fpr *restrict d, const fpr *restrict F, const fpr *restrict G,
+ const fpr *restrict f, const fpr *restrict g, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t F_re, F_im, G_re, G_im;
+ float64x2x4_t f_re, f_im, g_re, g_im;
+ float64x2x4_t a_re, a_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(F_re, &F[i]);
+ vloadx4(F_im, &F[i + hn]);
+ vloadx4(f_re, &f[i]);
+ vloadx4(f_im, &f[i + hn]);
+
+ FPC_MUL_CONJx4(a_re, a_im, F_re, F_im, f_re, f_im);
+
+ vloadx4(G_re, &G[i]);
+ vloadx4(g_re, &g[i]);
+
+ vloadx4(G_im, &G[i + hn]);
+ vloadx4(g_im, &g[i + hn]);
+
+ FPC_MLA_CONJx4(a_re, a_im, G_re, G_im, g_re, g_im);
+
+ vstorex4(&d[i], a_re);
+ vstorex4(&d[i + hn], a_im);
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, c_re, c_im;
+ float64x2x2_t a_re_im, b_re_im, c_re_im;
+ switch (logn) {
+ case 1:
+ // n = 2; hn = 1; i = 0
+ vload(a_re.val[0], &a[0]);
+ vfmuln(a_re.val[0], a_re.val[0], b[0]);
+ vstore(&c[0], a_re.val[0]);
+ break;
+
+ case 2:
+ // n = 4; hn = 2; i = 0, 1
+ vload2(a_re_im, &a[0]);
+ vload(b_re_im.val[0], &b[0]);
+ vfmul_lane(c_re_im.val[0], a_re_im.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re_im.val[1], a_re_im.val[1], b_re_im.val[0], 1);
+ vstore2(&c[0], c_re_im);
+ break;
+
+ case 3:
+ // n = 8; hn = 4; i = 0,1,2,3
+ vload4(a_re, &a[0]);
+ vloadx2(b_re_im, &b[0]);
+ vfmul_lane(c_re.val[0], a_re.val[0], b_re_im.val[0], 0);
+ vfmul_lane(c_re.val[1], a_re.val[1], b_re_im.val[0], 1);
+ vfmul_lane(c_re.val[2], a_re.val[2], b_re_im.val[1], 0);
+ vfmul_lane(c_re.val[3], a_re.val[3], b_re_im.val[1], 1);
+ vstore4(&c[0], c_re);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+ vloadx4(b_re, &b[i]);
+
+ vfmulx4(c_re, a_re, b_re);
+ vfmulx4(c_im, a_im, b_re);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+ break;
+ }
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_autoadj_fft(fpr *c, const fpr *a,
+ const fpr *restrict b,
+ unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t a_re, a_im, b_re, binv, c_re, c_im;
+
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(b_re, &b[i]);
+ vfinvx4(binv, b_re);
+
+ vloadx4(a_re, &a[i]);
+ vloadx4(a_im, &a[i + hn]);
+
+ vfmulx4(c_re, a_re, binv);
+ vfmulx4(c_im, a_im, binv);
+
+ vstorex4(&c[i], c_re);
+ vstorex4(&c[i + hn], c_im);
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log1(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&g11[0], g11_re.val[0]);
+ vstore(&g01[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log2(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&g11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&g01[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log3(
+ const fpr *restrict g00, fpr *restrict g01, fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&g11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&g01[0], mu_re);
+}
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(const fpr *restrict g00,
+ fpr *restrict g01,
+ fpr *restrict g11, unsigned logn) {
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log1(g00, g01, g11);
+
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log2(g00, g01, g11);
+
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft_log3(g00, g01, g11);
+
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&g01[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&g11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&g11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&g01[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log1(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g01_re, g11_re;
+ float64x2x4_t mu_re, m;
+ float64x2_t neon_1i2;
+
+ const fpr imagine[2] = {1.0, -1.0};
+ // n = 2; hn = 1;
+ vload(g00_re.val[0], &g00[0]);
+
+ // g00_re^2 | g00_im^2
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ // 1 / ( g00_re^2 + g00_im^2 )
+ m.val[0] = vdupq_n_f64(1 / vaddvq_f64(m.val[0]));
+
+ vload(g01_re.val[0], &g01[0]);
+ vload(neon_1i2, &imagine[0]);
+
+ // g01_re * g00_re | g01_im * g00_im
+ vfmul(g01_re.val[2], g01_re.val[0], g00_re.val[0]);
+
+ // g01_im | -g01_re
+ vswap(g01_re.val[1], g01_re.val[0]);
+ vfmul(g01_re.val[1], g01_re.val[1], neon_1i2);
+ // g01_im * g00_re - g01_re * g00_im
+ vfmul(g01_re.val[1], g01_re.val[1], g00_re.val[0]);
+ mu_re.val[0] = vpaddq_f64(g01_re.val[2], g01_re.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+
+ // re: mu_re * g01_re + mu_im * g01_im
+ vfmul(g01_re.val[1], mu_re.val[0], g01_re.val[0]);
+
+ vfmul(g01_re.val[2], g01_re.val[0], neon_1i2);
+ vswap(g01_re.val[2], g01_re.val[2]);
+ // im: -g01_im * mu_re + g01_re * mu_im
+ vfmul(g01_re.val[2], g01_re.val[2], mu_re.val[0]);
+ g01_re.val[0] = vpaddq_f64(g01_re.val[1], g01_re.val[2]);
+
+ vload(g11_re.val[0], &g11[0]);
+
+ vfsub(g11_re.val[0], g11_re.val[0], g01_re.val[0]);
+ vfmul(mu_re.val[0], mu_re.val[0], neon_1i2);
+
+ vstore(&d11[0], g11_re.val[0]);
+ vstore(&l10[0], mu_re.val[0]);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log2(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+ float64x2x2_t tmp;
+
+ // n = 4; hn = 2
+ vloadx2(tmp, &g00[0]);
+ g00_re.val[0] = tmp.val[0];
+ g00_im.val[0] = tmp.val[1];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vloadx2(tmp, &g01[0]);
+ g01_re.val[0] = tmp.val[0];
+ g01_im.val[0] = tmp.val[1];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+
+ vloadx2(tmp, &g11[0]);
+ g11_re.val[0] = tmp.val[0];
+ g11_im.val[0] = tmp.val[1];
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+
+ tmp.val[0] = d_re.val[0];
+ tmp.val[1] = d_im.val[0];
+ vstorex2(&d11[0], tmp);
+
+ vfneg(mu_im.val[0], mu_im.val[0]);
+ tmp.val[0] = mu_re.val[0];
+ tmp.val[1] = mu_im.val[0];
+ vstorex2(&l10[0], tmp);
+}
+
+static inline void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log3(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11) {
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re;
+ float64x2x4_t mu_re, mu_im, m, d_re;
+ // n = 8; hn = 4
+ vloadx4(g00_re, &g00[0]);
+ g00_im.val[0] = g00_re.val[2];
+ g00_im.val[1] = g00_re.val[3];
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vloadx4(g01_re, &g01[0]);
+ g01_im.val[0] = g01_re.val[2];
+ g01_im.val[1] = g01_re.val[3];
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[0], mu_re.val[0], m.val[0]);
+ vfmul(mu_re.val[1], mu_re.val[1], m.val[1]);
+ vfmul(mu_im.val[0], mu_im.val[0], m.val[0]);
+ vfmul(mu_im.val[1], mu_im.val[1], m.val[1]);
+
+ vloadx4(g11_re, &g11[0]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_re.val[2], d_re.val[2], mu_re.val[0], g01_im.val[0]);
+
+ vfmls(d_re.val[3], g11_re.val[3], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_re.val[3], d_re.val[3], mu_re.val[1], g01_im.val[1]);
+
+ vstorex4(&d11[0], d_re);
+
+ vfneg(mu_re.val[2], mu_im.val[0]);
+ vfneg(mu_re.val[3], mu_im.val[1]);
+
+ vstorex4(&l10[0], mu_re);
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(
+ fpr *restrict d11, fpr *restrict l10, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11, unsigned logn) {
+
+ const unsigned falcon_n = 1 << logn;
+ const unsigned hn = falcon_n >> 1;
+ float64x2x4_t g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ float64x2x4_t mu_re, mu_im, m, d_re, d_im;
+
+ switch (logn) {
+ case 1:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log1(d11, l10, g00, g01, g11);
+ break;
+
+ case 2:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log2(d11, l10, g00, g01, g11);
+ break;
+
+ case 3:
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft_log3(d11, l10, g00, g01, g11);
+ break;
+
+ default:
+ for (unsigned i = 0; i < hn; i += 8) {
+ vloadx4(g00_re, &g00[i]);
+ vloadx4(g00_im, &g00[i + hn]);
+
+ vfmul(m.val[0], g00_re.val[0], g00_re.val[0]);
+ vfmla(m.val[0], m.val[0], g00_im.val[0], g00_im.val[0]);
+ vfinv(m.val[0], m.val[0]);
+
+ vfmul(m.val[1], g00_re.val[1], g00_re.val[1]);
+ vfmla(m.val[1], m.val[1], g00_im.val[1], g00_im.val[1]);
+ vfinv(m.val[1], m.val[1]);
+
+ vfmul(m.val[2], g00_re.val[2], g00_re.val[2]);
+ vfmla(m.val[2], m.val[2], g00_im.val[2], g00_im.val[2]);
+ vfinv(m.val[2], m.val[2]);
+
+ vfmul(m.val[3], g00_re.val[3], g00_re.val[3]);
+ vfmla(m.val[3], m.val[3], g00_im.val[3], g00_im.val[3]);
+ vfinv(m.val[3], m.val[3]);
+
+ vloadx4(g01_re, &g01[i]);
+ vloadx4(g01_im, &g01[i + hn]);
+
+ vfmul(mu_re.val[0], g01_re.val[0], g00_re.val[0]);
+ vfmla(mu_re.val[0], mu_re.val[0], g01_im.val[0], g00_im.val[0]);
+
+ vfmul(mu_re.val[1], g01_re.val[1], g00_re.val[1]);
+ vfmla(mu_re.val[1], mu_re.val[1], g01_im.val[1], g00_im.val[1]);
+
+ vfmul(mu_re.val[2], g01_re.val[2], g00_re.val[2]);
+ vfmla(mu_re.val[2], mu_re.val[2], g01_im.val[2], g00_im.val[2]);
+
+ vfmul(mu_re.val[3], g01_re.val[3], g00_re.val[3]);
+ vfmla(mu_re.val[3], mu_re.val[3], g01_im.val[3], g00_im.val[3]);
+
+ vfmul(mu_im.val[0], g01_im.val[0], g00_re.val[0]);
+ vfmls(mu_im.val[0], mu_im.val[0], g01_re.val[0], g00_im.val[0]);
+
+ vfmul(mu_im.val[1], g01_im.val[1], g00_re.val[1]);
+ vfmls(mu_im.val[1], mu_im.val[1], g01_re.val[1], g00_im.val[1]);
+
+ vfmul(mu_im.val[2], g01_im.val[2], g00_re.val[2]);
+ vfmls(mu_im.val[2], mu_im.val[2], g01_re.val[2], g00_im.val[2]);
+
+ vfmul(mu_im.val[3], g01_im.val[3], g00_re.val[3]);
+ vfmls(mu_im.val[3], mu_im.val[3], g01_re.val[3], g00_im.val[3]);
+
+ vfmulx4(mu_re, mu_re, m);
+ vfmulx4(mu_im, mu_im, m);
+ vstorex4(&l10[i], mu_re);
+
+ vloadx4(g11_re, &g11[i]);
+ vloadx4(g11_im, &g11[i + hn]);
+
+ vfmls(d_re.val[0], g11_re.val[0], mu_re.val[0], g01_re.val[0]);
+ vfmls(d_re.val[0], d_re.val[0], mu_im.val[0], g01_im.val[0]);
+ vfmls(d_re.val[1], g11_re.val[1], mu_re.val[1], g01_re.val[1]);
+ vfmls(d_re.val[1], d_re.val[1], mu_im.val[1], g01_im.val[1]);
+
+ vfmls(d_re.val[2], g11_re.val[2], mu_re.val[2], g01_re.val[2]);
+ vfmls(d_re.val[2], d_re.val[2], mu_im.val[2], g01_im.val[2]);
+ vfmls(d_re.val[3], g11_re.val[3], mu_re.val[3], g01_re.val[3]);
+ vfmls(d_re.val[3], d_re.val[3], mu_im.val[3], g01_im.val[3]);
+ vstorex4(&d11[i], d_re);
+
+ vfmls(d_im.val[0], g11_im.val[0], mu_im.val[0], g01_re.val[0]);
+ vfmla(d_im.val[0], d_im.val[0], mu_re.val[0], g01_im.val[0]);
+ vfmls(d_im.val[1], g11_im.val[1], mu_im.val[1], g01_re.val[1]);
+ vfmla(d_im.val[1], d_im.val[1], mu_re.val[1], g01_im.val[1]);
+
+ vfmls(d_im.val[2], g11_im.val[2], mu_im.val[2], g01_re.val[2]);
+ vfmla(d_im.val[2], d_im.val[2], mu_re.val[2], g01_im.val[2]);
+ vfmls(d_im.val[3], g11_im.val[3], mu_im.val[3], g01_re.val[3]);
+ vfmla(d_im.val[3], d_im.val[3], mu_re.val[3], g01_im.val[3]);
+ vstorex4(&d11[i + hn], d_im);
+
+ vfnegx4(mu_im, mu_im);
+ vstorex4(&l10[i + hn], mu_im);
+ }
+ break;
+ }
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(fpr *t0, const uint16_t *hm,
+ const unsigned falcon_n) {
+ float64x2x4_t neon_t0;
+ uint16x8x4_t neon_hm;
+ uint16x8_t neon_zero;
+ uint32x4x4_t neon_hmu32[2];
+ int64x2x4_t neon_hms64[4];
+ neon_zero = vdupq_n_u16(0);
+ for (unsigned u = 0; u < falcon_n; u += 32) {
+ neon_hm = vld1q_u16_x4(&hm[u]);
+ neon_hmu32[0].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[0], neon_zero);
+ neon_hmu32[0].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[1], neon_zero);
+ neon_hmu32[0].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[1], neon_zero);
+
+ neon_hmu32[1].val[0] = (uint32x4_t)vzip1q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[1] = (uint32x4_t)vzip2q_u16(neon_hm.val[2], neon_zero);
+ neon_hmu32[1].val[2] = (uint32x4_t)vzip1q_u16(neon_hm.val[3], neon_zero);
+ neon_hmu32[1].val[3] = (uint32x4_t)vzip2q_u16(neon_hm.val[3], neon_zero);
+
+ neon_hms64[0].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[0].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[1].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[1].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[0].val[3], (uint32x4_t)neon_zero);
+
+ neon_hms64[2].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[0], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+ neon_hms64[2].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[1], (uint32x4_t)neon_zero);
+
+ neon_hms64[3].val[0] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[1] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[2], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[2] =
+ (int64x2_t)vzip1q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+ neon_hms64[3].val[3] =
+ (int64x2_t)vzip2q_u32(neon_hmu32[1].val[3], (uint32x4_t)neon_zero);
+
+ vfcvtx4(neon_t0, neon_hms64[0]);
+ vstorex4(&t0[u], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[1]);
+ vstorex4(&t0[u + 8], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[2]);
+ vstorex4(&t0[u + 16], neon_t0);
+
+ vfcvtx4(neon_t0, neon_hms64[3]);
+ vstorex4(&t0[u + 24], neon_t0);
+ }
+}
+
+fpr PQCLEAN_FALCONPADDED512_AARCH64_compute_bnorm(const fpr *rt1, const fpr *rt2) {
+ float64x2x4_t r1, r11, r2, r22;
+ float64x2x4_t bnorm, bnorm2;
+
+ vfdupx4(bnorm, 0);
+ vfdupx4(bnorm2, 0);
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r1, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r1.val[0], r1.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r1.val[1], r1.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r1.val[2], r1.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r1.val[3], r1.val[3]);
+
+ vloadx4(r11, &rt1[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r11.val[0], r11.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r11.val[1], r11.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r11.val[2], r11.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r11.val[3], r11.val[3]);
+ }
+
+ for (unsigned i = 0; i < FALCON_N;) {
+ vloadx4(r2, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm.val[0], bnorm.val[0], r2.val[0], r2.val[0]);
+ vfmla(bnorm.val[1], bnorm.val[1], r2.val[1], r2.val[1]);
+ vfmla(bnorm.val[2], bnorm.val[2], r2.val[2], r2.val[2]);
+ vfmla(bnorm.val[3], bnorm.val[3], r2.val[3], r2.val[3]);
+
+ vloadx4(r22, &rt2[i]);
+ i += 8;
+
+ vfmla(bnorm2.val[0], bnorm2.val[0], r22.val[0], r22.val[0]);
+ vfmla(bnorm2.val[1], bnorm2.val[1], r22.val[1], r22.val[1]);
+ vfmla(bnorm2.val[2], bnorm2.val[2], r22.val[2], r22.val[2]);
+ vfmla(bnorm2.val[3], bnorm2.val[3], r22.val[3], r22.val[3]);
+ }
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[1]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[1]);
+ vfadd(bnorm.val[2], bnorm.val[2], bnorm.val[3]);
+ vfadd(bnorm2.val[2], bnorm2.val[2], bnorm2.val[3]);
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm.val[2]);
+ vfadd(bnorm2.val[0], bnorm2.val[0], bnorm2.val[2]);
+
+ vfadd(bnorm.val[0], bnorm.val[0], bnorm2.val[0]);
+
+ return vaddvq_f64(bnorm.val[0]);
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_int.c
new file mode 100644
index 000000000..3e1120687
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/poly_int.c
@@ -0,0 +1,501 @@
+/*
+ * poly_int.c
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include <arm_neon.h>
+#include "macrous.h"
+#include "params.h"
+#include "poly.h"
+#include "ntt_consts.h"
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(int16_t out[FALCON_N], const int8_t in[FALCON_N]) {
+ // Total SIMD registers: 24 = 16 + 8
+ int16x8x4_t a, b, e, f; // 16
+ int8x16x4_t c, d; // 8
+
+ for (int i = 0; i < FALCON_N; i += 128) {
+ c = vld1q_s8_x4(&in[i]);
+
+ a.val[0] = vmovl_s8(vget_low_s8(c.val[0]));
+ a.val[2] = vmovl_s8(vget_low_s8(c.val[1]));
+ b.val[0] = vmovl_s8(vget_low_s8(c.val[2]));
+ b.val[2] = vmovl_s8(vget_low_s8(c.val[3]));
+
+ a.val[1] = vmovl_high_s8(c.val[0]);
+ a.val[3] = vmovl_high_s8(c.val[1]);
+ b.val[1] = vmovl_high_s8(c.val[2]);
+ b.val[3] = vmovl_high_s8(c.val[3]);
+
+ d = vld1q_s8_x4(&in[i + 64]);
+
+ e.val[0] = vmovl_s8(vget_low_s8(d.val[0]));
+ e.val[2] = vmovl_s8(vget_low_s8(d.val[1]));
+ f.val[0] = vmovl_s8(vget_low_s8(d.val[2]));
+ f.val[2] = vmovl_s8(vget_low_s8(d.val[3]));
+
+ e.val[1] = vmovl_high_s8(d.val[0]);
+ e.val[3] = vmovl_high_s8(d.val[1]);
+ f.val[1] = vmovl_high_s8(d.val[2]);
+ f.val[3] = vmovl_high_s8(d.val[3]);
+
+ vst1q_s16_x4(&out[i], a);
+ vst1q_s16_x4(&out[i + 32], b);
+ vst1q_s16_x4(&out[i + 64], e);
+ vst1q_s16_x4(&out[i + 96], f);
+ }
+}
+
+/*
+ * Return f[] = f[]/g[] % 12289
+ * See assembly https://godbolt.org/z/od3Ex7Mbx
+ */
+
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(int16_t f[FALCON_N], const int16_t g[FALCON_N]) {
+ // Total SIMD registers: 24 = 4 + 19 + 1
+ int16x8x4_t src, dst, t, k; // 4
+ int16x8x4_t y0, y1, y2, y3, y4, y5,
+ y6, y7, y8, y9, y10, y11, y12,
+ y13, y14, y15, y16, y17, y18; // 19
+ int16x8_t neon_qmvm; // 1
+
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ // Find y0 = g^12287
+ vload_s16_x4(y0, &g[i]);
+
+ // y0 is already in Montgomery domain
+
+ montmul_x4(y1, y0, y0, neon_qmvm, t);
+ montmul_x4(y2, y1, y0, neon_qmvm, k);
+ montmul_x4(y3, y2, y1, neon_qmvm, t);
+ montmul_x4(y4, y3, y3, neon_qmvm, k);
+ montmul_x4(y5, y4, y4, neon_qmvm, t);
+ montmul_x4(y6, y5, y5, neon_qmvm, k);
+ montmul_x4(y7, y6, y6, neon_qmvm, t);
+ montmul_x4(y8, y7, y7, neon_qmvm, k);
+ montmul_x4(y9, y8, y2, neon_qmvm, t);
+ montmul_x4(y10, y9, y8, neon_qmvm, k);
+ montmul_x4(y11, y10, y10, neon_qmvm, t);
+ montmul_x4(y12, y11, y11, neon_qmvm, k);
+ montmul_x4(y13, y12, y9, neon_qmvm, t);
+ montmul_x4(y14, y13, y13, neon_qmvm, k);
+ montmul_x4(y15, y14, y14, neon_qmvm, t);
+ montmul_x4(y16, y15, y10, neon_qmvm, k);
+ montmul_x4(y17, y16, y16, neon_qmvm, t);
+ montmul_x4(y18, y17, y0, neon_qmvm, k);
+
+ vload_s16_x4(src, &f[i]);
+
+ montmul_x4(dst, y18, src, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], dst);
+ }
+}
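+
+/*
+ * Illustrative aside (not part of the upstream implementation): the montmul
+ * chain above is an addition chain for the exponent 12287 = FALCON_Q - 2, so
+ * it computes g^(q-2) = g^-1 mod q (Fermat inversion), and dst = f * g^-1.
+ * A minimal scalar sketch of the same inversion, ignoring the Montgomery
+ * domain and kept under "#if 0" so it is never compiled:
+ */
+#if 0
+static int16_t div_12289_ref(int16_t f, int16_t g) {
+ const int32_t q = 12289;
+ int32_t base = ((int32_t)g % q + q) % q;
+ int32_t inv = 1;
+ for (int32_t e = q - 2; e > 0; e >>= 1) {
+ if (e & 1) {
+ inv = (inv * base) % q; /* square-and-multiply */
+ }
+ base = (base * base) % q;
+ }
+ return (int16_t)((((int32_t)f % q + q) % q * inv) % q);
+}
+#endif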
+
+/*
+ * f = g - s
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(int16_t f[FALCON_N], const int16_t g[FALCON_N], const int16_t s[FALCON_N]) {
+ // Total SIMD registers: 29 = 28 + 1
+ int16x8x4_t a, b, c, d, e, h, t; // 28
+ int16x8_t neon_qmvm; // 1
+ neon_qmvm = vld1q_s16(PQCLEAN_FALCONPADDED512_AARCH64_qmvq);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &g[i]);
+ vload_s16_x4(b, &s[i]);
+
+ e.val[0] = vsubq_s16(a.val[0], b.val[0]);
+ e.val[1] = vsubq_s16(a.val[1], b.val[1]);
+ e.val[2] = vsubq_s16(a.val[2], b.val[2]);
+ e.val[3] = vsubq_s16(a.val[3], b.val[3]);
+
+ vload_s16_x4(c, &g[i + 32]);
+ vload_s16_x4(d, &s[i + 32]);
+
+ h.val[0] = vsubq_s16(c.val[0], d.val[0]);
+ h.val[1] = vsubq_s16(c.val[1], d.val[1]);
+ h.val[2] = vsubq_s16(c.val[2], d.val[2]);
+ h.val[3] = vsubq_s16(c.val[3], d.val[3]);
+
+ barrett_x4(e, neon_qmvm, t);
+ barrett_x4(h, neon_qmvm, t);
+
+ vstore_s16_x4(&f[i], e);
+ vstore_s16_x4(&f[i + 32], h);
+ }
+}
+
+/*
+ * Check whether f[] contains a zero coefficient.
+ * Return:
+ * 1 if 0 occurs in f[]
+ * 0 otherwise
+ */
+uint16_t PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 22 = 12 + 8 + 2
+ int16x8x4_t a, b; // 8
+ uint16x8x4_t c, d, e1; // 12
+ uint16x8x2_t e2; // 2
+
+ e2.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &f[i]);
+
+ // Compare bitwise Equal to zero (vector)
+ // a == 0 ? 1 : 0;
+ c.val[0] = vceqzq_s16(a.val[0]);
+ c.val[1] = vceqzq_s16(a.val[1]);
+ c.val[2] = vceqzq_s16(a.val[2]);
+ c.val[3] = vceqzq_s16(a.val[3]);
+
+ vload_s16_x4(b, &f[i + 32]);
+
+ d.val[0] = vceqzq_s16(b.val[0]);
+ d.val[1] = vceqzq_s16(b.val[1]);
+ d.val[2] = vceqzq_s16(b.val[2]);
+ d.val[3] = vceqzq_s16(b.val[3]);
+
+ e1.val[0] = vorrq_u16(d.val[0], c.val[0]);
+ e1.val[1] = vorrq_u16(d.val[1], c.val[1]);
+ e1.val[2] = vorrq_u16(d.val[2], c.val[2]);
+ e1.val[3] = vorrq_u16(d.val[3], c.val[3]);
+
+ e1.val[0] = vorrq_u16(e1.val[0], e1.val[2]);
+ e1.val[1] = vorrq_u16(e1.val[1], e1.val[3]);
+
+ e2.val[0] = vorrq_u16(e1.val[0], e1.val[1]);
+
+ e2.val[1] = vorrq_u16(e2.val[1], e2.val[0]);
+ }
+
+ uint16_t ret = vmaxvq_u16(e2.val[1]);
+
+ return ret;
+}
+
+/*
+ * Branchless conditional addition of 2*FALCON_Q where a coefficient is < 0.
+ * If a coefficient is then larger than Q, Q is subtracted from it.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
+ // Total SIMD registers: 26 = 8 + 16 + 1 + 1
+ uint16x8x4_t b0, b1; // 8
+ int16x8x4_t a0, a1, c0, c1; // 16
+ int16x8_t neon_q; // 1
+ uint16x8_t neon_2q; // 1
+
+ neon_q = vdupq_n_s16(FALCON_Q);
+ neon_2q = vdupq_n_u16(FALCON_Q << 1);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a0, &f[i]);
+
+ b0.val[0] = vcltzq_s16(a0.val[0]);
+ b0.val[1] = vcltzq_s16(a0.val[1]);
+ b0.val[2] = vcltzq_s16(a0.val[2]);
+ b0.val[3] = vcltzq_s16(a0.val[3]);
+
+ vload_s16_x4(a1, &f[i + 32]);
+
+ // Conditional addition with 2*FALCON_Q
+ b1.val[0] = vcltzq_s16(a1.val[0]);
+ b1.val[1] = vcltzq_s16(a1.val[1]);
+ b1.val[2] = vcltzq_s16(a1.val[2]);
+ b1.val[3] = vcltzq_s16(a1.val[3]);
+
+ c0.val[0] = vreinterpretq_s16_u16(vandq_u16(b0.val[0], neon_2q));
+ c0.val[1] = vreinterpretq_s16_u16(vandq_u16(b0.val[1], neon_2q));
+ c0.val[2] = vreinterpretq_s16_u16(vandq_u16(b0.val[2], neon_2q));
+ c0.val[3] = vreinterpretq_s16_u16(vandq_u16(b0.val[3], neon_2q));
+
+ c1.val[0] = vreinterpretq_s16_u16(vandq_u16(b1.val[0], neon_2q));
+ c1.val[1] = vreinterpretq_s16_u16(vandq_u16(b1.val[1], neon_2q));
+ c1.val[2] = vreinterpretq_s16_u16(vandq_u16(b1.val[2], neon_2q));
+ c1.val[3] = vreinterpretq_s16_u16(vandq_u16(b1.val[3], neon_2q));
+
+ vadd_x4(a0, a0, c0);
+ vadd_x4(a1, a1, c1);
+
+ // a > Q ? 1 : 0
+ b0.val[0] = vcgtq_s16(a0.val[0], neon_q);
+ b0.val[1] = vcgtq_s16(a0.val[1], neon_q);
+ b0.val[2] = vcgtq_s16(a0.val[2], neon_q);
+ b0.val[3] = vcgtq_s16(a0.val[3], neon_q);
+
+ b1.val[0] = vcgtq_s16(a1.val[0], neon_q);
+ b1.val[1] = vcgtq_s16(a1.val[1], neon_q);
+ b1.val[2] = vcgtq_s16(a1.val[2], neon_q);
+ b1.val[3] = vcgtq_s16(a1.val[3], neon_q);
+
+ // Conditional subtraction with FALCON_Q
+
+ c0.val[0] = vandq_s16(vreinterpretq_s16_u16(b0.val[0]), neon_q);
+ c0.val[1] = vandq_s16(vreinterpretq_s16_u16(b0.val[1]), neon_q);
+ c0.val[2] = vandq_s16(vreinterpretq_s16_u16(b0.val[2]), neon_q);
+ c0.val[3] = vandq_s16(vreinterpretq_s16_u16(b0.val[3]), neon_q);
+
+ c1.val[0] = vandq_s16(vreinterpretq_s16_u16(b1.val[0]), neon_q);
+ c1.val[1] = vandq_s16(vreinterpretq_s16_u16(b1.val[1]), neon_q);
+ c1.val[2] = vandq_s16(vreinterpretq_s16_u16(b1.val[2]), neon_q);
+ c1.val[3] = vandq_s16(vreinterpretq_s16_u16(b1.val[3]), neon_q);
+
+ vsub_x4(a0, a0, c0);
+ vsub_x4(a1, a1, c1);
+
+ vstore_s16_x4(&f[i], a0);
+ vstore_s16_x4(&f[i + 32], a1);
+ }
+}
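+
+/*
+ * Illustrative aside (not part of the upstream implementation): a scalar
+ * sketch of the branchless pattern used above. Each comparison yields an
+ * all-ones/all-zeros mask which is ANDed with the constant, so the addition
+ * or subtraction only takes effect where the condition holds. Kept under
+ * "#if 0" so it is never compiled:
+ */
+#if 0
+static int16_t convert_to_unsigned_ref(int16_t a) {
+ uint16_t m;
+ m = (uint16_t)(-(a < 0)); /* all ones where a < 0, else 0 */
+ a = (int16_t)(a + (m & (2 * FALCON_Q)));
+ m = (uint16_t)(-(a > FALCON_Q)); /* all ones where a > FALCON_Q, else 0 */
+ a = (int16_t)(a - (m & FALCON_Q));
+ return a;
+}
+#endif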
+
+/*
+ * Perform conditional subtraction/addition with Q to center each coefficient,
+ * then compare against the bounds min, max = -127, 127
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]) {
+ // Total SIMD registers: 32
+ int16x8x4_t a, f; // 8
+ int16x8x4_t d0, d1; // 8
+ uint16x8x4_t c0, c1, x0, x1; // 16
+ uint16x8x2_t e; // 2
+ int8x16x4_t g; // 4
+ int16x8_t neon_127, neon__127, neon_q_2, neon__q_2; // 4
+ uint16x8_t neon_q; // 1
+ neon_127 = vdupq_n_s16(127);
+ neon__127 = vdupq_n_s16(-127);
+ neon_q = vdupq_n_u16(FALCON_Q);
+ neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
+ neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
+
+ e.val[1] = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ vload_s16_x4(a, &t[i]);
+ vload_s16_x4(f, &t[i + 32]);
+
+ // Conditional subtraction with FALCON_Q
+ // a >= Q/2 ? 1 : 0
+ c0.val[0] = vcgeq_s16(a.val[0], neon_q_2);
+ c0.val[1] = vcgeq_s16(a.val[1], neon_q_2);
+ c0.val[2] = vcgeq_s16(a.val[2], neon_q_2);
+ c0.val[3] = vcgeq_s16(a.val[3], neon_q_2);
+
+ c1.val[0] = vcgeq_s16(f.val[0], neon_q_2);
+ c1.val[1] = vcgeq_s16(f.val[1], neon_q_2);
+ c1.val[2] = vcgeq_s16(f.val[2], neon_q_2);
+ c1.val[3] = vcgeq_s16(f.val[3], neon_q_2);
+
+ // Perform subtraction with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vsub_x4(a, a, d0);
+ vsub_x4(f, f, d1);
+
+ // -Q/2 > a ? 1: 0
+ c0.val[0] = vcgtq_s16(neon__q_2, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__q_2, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__q_2, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__q_2, a.val[3]);
+
+ c1.val[0] = vcgtq_s16(neon__q_2, f.val[0]);
+ c1.val[1] = vcgtq_s16(neon__q_2, f.val[1]);
+ c1.val[2] = vcgtq_s16(neon__q_2, f.val[2]);
+ c1.val[3] = vcgtq_s16(neon__q_2, f.val[3]);
+
+ // Perform addition with Q
+ d0.val[0] = vreinterpretq_s16_u16(vandq_u16(c0.val[0], neon_q));
+ d0.val[1] = vreinterpretq_s16_u16(vandq_u16(c0.val[1], neon_q));
+ d0.val[2] = vreinterpretq_s16_u16(vandq_u16(c0.val[2], neon_q));
+ d0.val[3] = vreinterpretq_s16_u16(vandq_u16(c0.val[3], neon_q));
+
+ d1.val[0] = vreinterpretq_s16_u16(vandq_u16(c1.val[0], neon_q));
+ d1.val[1] = vreinterpretq_s16_u16(vandq_u16(c1.val[1], neon_q));
+ d1.val[2] = vreinterpretq_s16_u16(vandq_u16(c1.val[2], neon_q));
+ d1.val[3] = vreinterpretq_s16_u16(vandq_u16(c1.val[3], neon_q));
+
+ vadd_x4(a, a, d0);
+ vadd_x4(f, f, d1);
+
+ g.val[0] = vmovn_high_s16(vmovn_s16(a.val[0]), a.val[1]);
+ g.val[1] = vmovn_high_s16(vmovn_s16(a.val[2]), a.val[3]);
+ g.val[2] = vmovn_high_s16(vmovn_s16(f.val[0]), f.val[1]);
+ g.val[3] = vmovn_high_s16(vmovn_s16(f.val[2]), f.val[3]);
+
+ vst1q_s8_x4(&G[i], g);
+
+ // -127 > a ? 1 : 0
+ c0.val[0] = vcgtq_s16(neon__127, a.val[0]);
+ c0.val[1] = vcgtq_s16(neon__127, a.val[1]);
+ c0.val[2] = vcgtq_s16(neon__127, a.val[2]);
+ c0.val[3] = vcgtq_s16(neon__127, a.val[3]);
+ // a > 127 ? 1 : 0
+ c1.val[0] = vcgtq_s16(a.val[0], neon_127);
+ c1.val[1] = vcgtq_s16(a.val[1], neon_127);
+ c1.val[2] = vcgtq_s16(a.val[2], neon_127);
+ c1.val[3] = vcgtq_s16(a.val[3], neon_127);
+
+ // -127 > f ? 1 : 0
+ x0.val[0] = vcgtq_s16(neon__127, f.val[0]);
+ x0.val[1] = vcgtq_s16(neon__127, f.val[1]);
+ x0.val[2] = vcgtq_s16(neon__127, f.val[2]);
+ x0.val[3] = vcgtq_s16(neon__127, f.val[3]);
+ // f > 127 ? 1 : 0
+ x1.val[0] = vcgtq_s16(f.val[0], neon_127);
+ x1.val[1] = vcgtq_s16(f.val[1], neon_127);
+ x1.val[2] = vcgtq_s16(f.val[2], neon_127);
+ x1.val[3] = vcgtq_s16(f.val[3], neon_127);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c1.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], c1.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], c1.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], c1.val[3]);
+
+ x0.val[0] = vorrq_u16(x0.val[0], x1.val[0]);
+ x0.val[1] = vorrq_u16(x0.val[1], x1.val[1]);
+ x0.val[2] = vorrq_u16(x0.val[2], x1.val[2]);
+ x0.val[3] = vorrq_u16(x0.val[3], x1.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], x0.val[0]);
+ c0.val[1] = vorrq_u16(c0.val[1], x0.val[1]);
+ c0.val[2] = vorrq_u16(c0.val[2], x0.val[2]);
+ c0.val[3] = vorrq_u16(c0.val[3], x0.val[3]);
+
+ c0.val[0] = vorrq_u16(c0.val[0], c0.val[2]);
+ c0.val[1] = vorrq_u16(c0.val[1], c0.val[3]);
+
+ e.val[0] = vorrq_u16(c0.val[0], c0.val[1]);
+
+ e.val[1] = vorrq_u16(e.val[1], e.val[0]);
+ }
+ if (vmaxvq_u16(e.val[1])) {
+ return 1;
+ }
+ return 0;
+}
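+
+/*
+ * Illustrative aside (not part of the upstream implementation): a scalar
+ * sketch of the reduction and range check above. Each coefficient is moved
+ * toward the centered range around 0 and flagged if it does not fit the
+ * [-127, 127] window of the int8 encoding; the vector code stores G[] first
+ * and aggregates the flag, but the outcome is the same. Kept under "#if 0"
+ * so it is never compiled:
+ */
+#if 0
+static int int16_to_int8_ref(int8_t *G, const int16_t *t, unsigned n) {
+ int out_of_range = 0;
+ for (unsigned i = 0; i < n; i++) {
+ int16_t a = t[i];
+ if (a >= FALCON_Q / 2) {
+ a = (int16_t)(a - FALCON_Q);
+ } else if (a < -(FALCON_Q / 2)) {
+ a = (int16_t)(a + FALCON_Q);
+ }
+ G[i] = (int8_t)a;
+ out_of_range |= (a < -127) || (a > 127);
+ }
+ return out_of_range;
+}
+#endif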
+
+/*
+ * Check whether any coefficient satisfies (t < low || t > high)
+ * Return 1 if so
+ * Otherwise 0
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int8(const int8_t t[FALCON_N],
+ const int8_t low, const int8_t high) {
+ // Total SIMD registers: 15
+ int8x16x4_t a; // 4
+ uint8x16x4_t c, d; // 8
+ uint8x16_t e; // 1
+ int8x16_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s8(high);
+ neon_low = vdupq_n_s8(low);
+ e = vdupq_n_u8(0);
+
+ for (int i = 0; i < FALCON_N; i += 64) {
+ a = vld1q_s8_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s8(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s8(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s8(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s8(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s8(a.val[0], neon_high);
+ d.val[1] = vcgtq_s8(a.val[1], neon_high);
+ d.val[2] = vcgtq_s8(a.val[2], neon_high);
+ d.val[3] = vcgtq_s8(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u8(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u8(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u8(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u8(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u8(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u8(c.val[0], c.val[1]);
+
+ e = vorrq_u8(e, c.val[0]);
+
+ if (vmaxvq_u8(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Check whether any coefficient satisfies (t < low || t > high)
+ * Return 1 if so
+ * Otherwise 0
+ * Works for FALCON_N >= 32, i.e. FALCON_LOGN >= 5
+ */
+int PQCLEAN_FALCONPADDED512_AARCH64_poly_check_bound_int16(const int16_t t[FALCON_N],
+ const int16_t low, const int16_t high) {
+ // Total SIMD registers = 15
+ int16x8x4_t a; // 4
+ uint16x8x4_t c, d; // 8
+ uint16x8_t e; // 1
+ int16x8_t neon_low, neon_high; // 2
+
+ neon_high = vdupq_n_s16(high);
+ neon_low = vdupq_n_s16(low);
+ e = vdupq_n_u16(0);
+
+ for (int i = 0; i < FALCON_N; i += 32) {
+ a = vld1q_s16_x4(&t[i]);
+
+ // low > a ? 1 : 0
+ c.val[0] = vcgtq_s16(neon_low, a.val[0]);
+ c.val[1] = vcgtq_s16(neon_low, a.val[1]);
+ c.val[2] = vcgtq_s16(neon_low, a.val[2]);
+ c.val[3] = vcgtq_s16(neon_low, a.val[3]);
+ // a > high ? 1 : 0
+ d.val[0] = vcgtq_s16(a.val[0], neon_high);
+ d.val[1] = vcgtq_s16(a.val[1], neon_high);
+ d.val[2] = vcgtq_s16(a.val[2], neon_high);
+ d.val[3] = vcgtq_s16(a.val[3], neon_high);
+
+ c.val[0] = vorrq_u16(c.val[0], d.val[0]);
+ c.val[1] = vorrq_u16(c.val[1], d.val[1]);
+ c.val[2] = vorrq_u16(c.val[2], d.val[2]);
+ c.val[3] = vorrq_u16(c.val[3], d.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[2]);
+ c.val[1] = vorrq_u16(c.val[1], c.val[3]);
+
+ c.val[0] = vorrq_u16(c.val[0], c.val[1]);
+
+ e = vorrq_u16(e, c.val[0]);
+
+ if (vmaxvq_u16(e)) {
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/pqclean.c
new file mode 100644
index 000000000..bd6f04943
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/pqclean.c
@@ -0,0 +1,377 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES bytes
+ *
+ * message + signature:
+ * signature PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES bytes
+ * message
+ */
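+
+/*
+ * For illustration (this translation unit builds with logn = 9, i.e.
+ * Falcon-padded-512): the header bytes produced and checked below are
+ * therefore 0x59 (0101 1001) for the private key, 0x09 (0000 1001) for
+ * the public key and 0x39 (0011 1001) for the signature.
+ */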
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[28 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N];
+ uint16_t h[FALCON_N];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED512_AARCH64_keygen(&rng, f, g, F, NULL, h, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + FALCON_LOGN;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ f, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ g, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u,
+ F, PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[FALCON_LOGN]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + FALCON_LOGN;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1,
+ h, FALCON_LOGN);
+ if (v != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
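+
+/*
+ * For reference, with logn = 9 the encoding above yields the usual
+ * Falcon-512 sizes: 1 + 384 + 384 + 512 = 1281 bytes for the private key
+ * (6 bits per coefficient for f and g, 8 bits for F) and 1 + 896 = 897
+ * bytes for the public key (14 bits per coefficient for h).
+ */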
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[FALCON_N], g[FALCON_N], F[FALCON_N], G[FALCON_N];
+ struct {
+ int16_t sig[FALCON_N];
+ uint16_t hm[FALCON_N];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + FALCON_LOGN) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(
+ f, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(
+ g, PQCLEAN_FALCONPADDED512_AARCH64_max_fg_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AARCH64_trim_i8_decode(
+ F, PQCLEAN_FALCONPADDED512_AARCH64_max_FG_bits[FALCON_LOGN],
+ sk + u, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED512_AARCH64_complete_private(G, f, g, F, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+     * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(&sc, r.hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, tmp.b);
+ v = PQCLEAN_FALCONPADDED512_AARCH64_comp_encode(sigbuf, sigbuflen, r.sig);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
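+
+/*
+ * Note on the padded format: the memset() above fills the unused tail of
+ * sigbuf[] with zeros, so the encoded signature value always occupies
+ * exactly sigbuflen bytes and the complete signature has the fixed length
+ * PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES; do_verify() below accepts
+ * such trailing bytes only when they are all zero.
+ */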
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * FALCON_N];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int16_t h[FALCON_N];
+ int16_t hm[FALCON_N];
+ int16_t sig[FALCON_N];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + FALCON_LOGN) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED512_AARCH64_modq_decode( (uint16_t *) h,
+ pk + 1, PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1, FALCON_LOGN)
+ != PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ // We move the conversion to NTT domain of `h` inside verify_raw()
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED512_AARCH64_comp_decode(sig, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AARCH64_hash_to_point_ct(&sc, (uint16_t *) hm, FALCON_LOGN, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED512_AARCH64_verify_raw(hm, sig, h, (int16_t *) tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + FALCON_LOGN;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
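+
+/*
+ * Since vlen is fixed to PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES -
+ * NONCELEN - 1, *siglen is always exactly CRYPTO_BYTES: the padded
+ * variant never produces variable-length signatures.
+ */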
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + FALCON_LOGN;
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + FALCON_LOGN) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED512_AARCH64_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/rng.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/rng.c
new file mode 100644
index 000000000..cd5bd7703
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/rng.c
@@ -0,0 +1,194 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "inner.h"
+
+int PQCLEAN_FALCONPADDED512_AARCH64_get_seed(void *seed, size_t len) {
+ unsigned char tmp[48];
+ for (size_t i = 0; i < len; i++) {
+ tmp[i] = (unsigned char) i;
+ }
+ memcpy(seed, tmp, len);
+ return 1;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ *(uint32_t *)(p->state.d + (i << 2)) = w;
+ }
+ tl = *(uint32_t *)(p->state.d + 48);
+ th = *(uint32_t *)(p->state.d + 52);
+ *(uint64_t *)(p->state.d + 48) = tl + (th << 32);
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * The state consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
+ */
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
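+
+/*
+ * Layout note: with the indexing above, byte j (0..3) of word v of
+ * parallel instance u is stored at buf.d[(u << 2) + (v << 5) + j], so the
+ * 8 * 64 = 512-byte buffer holds eight interleaved ChaCha20 blocks rather
+ * than eight consecutive ones.
+ */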
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c
new file mode 100644
index 000000000..e77dc4b52
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sampler.c
@@ -0,0 +1,292 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include <arm_neon.h>
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+
+ uint32x4x3_t w;
+ uint32x4_t x0, x1, x2, cc0, cc1, cc2, zz;
+ uint32x2x3_t wh;
+ uint32x2_t cc0h, cc1h, cc2h, zzh;
+ x0 = vdupq_n_u32(v0);
+ x1 = vdupq_n_u32(v1);
+ x2 = vdupq_n_u32(v2);
+
+ // 0: 0, 3, 6, 9
+ // 1: 1, 4, 7, 10
+ // 2: 2, 5, 8, 11
+ // v0 - w0
+ // v1 - w1
+ // v2 - w2
+ // cc1 - cc0 >> 31
+ // cc2 - cc1 >> 31
+ // z + cc2 >> 31
+ w = vld3q_u32(&dist[0]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vshrq_n_u32(cc2, 31);
+
+ w = vld3q_u32(&dist[12]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[24]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ w = vld3q_u32(&dist[36]);
+ cc0 = vsubq_u32(x0, w.val[2]);
+ cc1 = vsubq_u32(x1, w.val[1]);
+ cc2 = vsubq_u32(x2, w.val[0]);
+ cc1 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc1, (int32x4_t)cc0, 31);
+ cc2 = (uint32x4_t)vsraq_n_s32((int32x4_t)cc2, (int32x4_t)cc1, 31);
+ zz = vsraq_n_u32(zz, cc2, 31);
+
+ // 0: 48, 51
+ // 1: 49, 52
+ // 2: 50, 53
+ wh = vld3_u32(&dist[48]);
+ cc0h = vsub_u32(vget_low_u32(x0), wh.val[2]);
+ cc1h = vsub_u32(vget_low_u32(x1), wh.val[1]);
+ cc2h = vsub_u32(vget_low_u32(x2), wh.val[0]);
+ cc1h = (uint32x2_t)vsra_n_s32((int32x2_t)cc1h, (int32x2_t)cc0h, 31);
+ cc2h = (uint32x2_t)vsra_n_s32((int32x2_t)cc2h, (int32x2_t)cc1h, 31);
+ zzh = vshr_n_u32(cc2h, 31);
+
+ z = (int) (vaddvq_u32(zz) + vaddv_u32(zzh));
+ return z;
+}
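+
+/*
+ * For reference: the NEON code above counts, in constant time, how many
+ * entries of dist[] (each stored as three 24-bit limbs, high limb first)
+ * are strictly greater than the random value v2:v1:v0. A scalar sketch of
+ * the same counting logic:
+ *
+ *     z = 0;
+ *     for (size_t u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) {
+ *         uint32_t cc;
+ *         cc = (v0 - dist[u + 2]) >> 31;
+ *         cc = (v1 - dist[u + 1] - cc) >> 31;
+ *         cc = (v2 - dist[u + 0] - cc) >> 31;
+ *         z += (int)cc;
+ *     }
+ */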
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+     * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+     * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+     * exp(-x) = 2^(-s) * exp(-r); we lazily compare exp(-x) with the
+     * PRNG output to limit its consumption, and the sign of the
+     * difference yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
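+
+/*
+ * In the loop above, the PRNG output is compared against z one byte at a
+ * time, most significant byte first, stopping at the first difference;
+ * the returned bit (w >> 31) is 1 exactly when the random 64-bit value is
+ * strictly below z, i.e. with probability z/2^64, which is approximately
+ * ccs*exp(-x).
+ */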
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED512_AARCH64_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED512_AARCH64_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+         * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sign.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sign.c
new file mode 100644
index 000000000..550a6e434
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/sign.c
@@ -0,0 +1,953 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+#include "macrof.h"
+#include "macrofx4.h"
+#include "util.h"
+#include <stddef.h>
+#include <string.h>
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees for size logn-1 each. Thus, treesize s()
+     * has two child trees of size logn-1 each. Thus, the tree size s()
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
+ return (logn + 1) << logn;
+}
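+
+/*
+ * Sanity check of the closed form: s(logn) = (logn + 1) * 2^logn indeed
+ * satisfies s(0) = 1 and s(logn) = 2^logn + 2*s(logn - 1). For logn = 9
+ * (Falcon-padded-512) the full tree therefore uses 10 * 512 = 5120 fpr
+ * elements.
+ */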
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *restrict tree,
+ fpr *restrict g0, fpr *restrict g1, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *restrict tree, const fpr *restrict g00,
+ const fpr *restrict g01, const fpr *restrict g11,
+ unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+
+ ffLDL_fft_inner(tree + n, d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1), d00, d00 + hn, logn - 1, tmp);
+
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma_9);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
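+
+/*
+ * With logn = 9, the expanded key therefore lays out b00, b01, b10 and
+ * b11 at fpr offsets 0, 512, 1024 and 1536, followed by the ffLDL tree at
+ * offset 2048, for a total of 4*512 + 5120 = 7168 fpr values.
+ */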
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_expand_privkey(fpr *restrict expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ uint8_t *restrict tmp) {
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rg = b00;
+ rf = b01;
+ rG = b10;
+ rF = b11;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rg, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rg, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rf, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rf, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(rf, rf, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rG, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rG, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(rF, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(rF, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(rF, rF, FALCON_LOGN);
+
+    /*
+     * The FFT conversions and negations above have put the basis
+     * B0 = [[g, -f], [G, -F]] in FFT representation.
+     */
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + FALCON_N;
+ g11 = g01 + FALCON_N;
+ gxx = g11 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(g00, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(g00, g00, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(g01, b00, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(g01, g01, b01, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(g11, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(g11, g11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, FALCON_LOGN, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, FALCON_LOGN, FALCON_LOGN);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *restrict t0, fpr *restrict t1,
+ fpr *restrict g00, fpr *restrict g01, fpr *restrict g11,
+ unsigned orig_logn, unsigned logn, fpr *restrict tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma_9);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(z1, t1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, t0, tmp, z1, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have room for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *restrict z0, fpr *restrict z1,
+ const fpr *restrict tree,
+ const fpr *restrict t0, const fpr *restrict t1, unsigned logn,
+ fpr *restrict tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ // Split
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ // Split
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ // Sampling
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ // Merge
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * General recursive case (logn >= 2).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub(tmp, t1, z1, logn);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(tmp, t0, tmp, tree, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ t0 = tmp;
+ t1 = t0 + FALCON_N;
+ b00 = expanded_key + skoff_b00(FALCON_LOGN);
+ b01 = expanded_key + skoff_b01(FALCON_LOGN);
+ b10 = expanded_key + skoff_b10(FALCON_LOGN);
+ b11 = expanded_key + skoff_b11(FALCON_LOGN);
+ tree = expanded_key + skoff_tree(FALCON_LOGN);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, FALCON_LOGN, ty + FALCON_N);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, tx, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, t0, ty, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t0, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, tx, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t1, t1, ty, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * Compute the signature.
+ */
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, fpr *restrict tmp) {
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ int16_t *s1tmp, *s2tmp;
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ *
+ * Memory layout:
+ * b00 | b01 | b10 | b11 | t0 | t1
+ * g00 | g01 | g11 | b01 | t0 | t1
+ */
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_fft(t1, b00, b10, FALCON_LOGN); // t1 <- b00*adj(b10)
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(t0, b01, FALCON_LOGN); // t0 <- b01*adj(b01)
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(b00, b00, FALCON_LOGN); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_add(b00, b00, t0, FALCON_LOGN); // b00 <- g00
+
+ memcpy(t0, b01, FALCON_N * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_muladj_add_fft(b01, t1, b01, b11, FALCON_LOGN); // b01 <- b01*adj(b11)
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_fft(b10, b10, FALCON_LOGN); // b10 <- b10*adj(b10)
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulselfadj_add_fft(b10, b10, b11, FALCON_LOGN); // t1 = g11 <- b11*adj(b11)
+
+ /*
+ * We rename variables to make things clearer. The three elements
+     * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_fpr_of_s16(t0, hm, FALCON_N);
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(t0, FALCON_LOGN);
+ ni = fpr_inverse_of_q;
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t1, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t1, t1, fpr_neg(ni), FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(t0, t0, b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mulconst(t0, t0, ni, FALCON_LOGN);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = g11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ * t1, g00
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, FALCON_LOGN, FALCON_LOGN, t1 + FALCON_N);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + FALCON_N;
+ b10 = b01 + FALCON_N;
+ b11 = b10 + FALCON_N;
+ memmove(b11 + FALCON_N, t0, FALCON_N * 2 * sizeof * t0);
+ t0 = b11 + FALCON_N;
+ t1 = t0 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b00, g, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b00, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b01, f, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b01, b01, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b10, G, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b10, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(b11, F, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_FFT(b11, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_neg(b11, b11, FALCON_LOGN);
+
+ tx = t1 + FALCON_N;
+ ty = tx + FALCON_N;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(tx, t0, b00, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_fft(ty, t0, b01, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t0, tx, t1, b10, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_mul_add_fft(t1, ty, t1, b11, FALCON_LOGN);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t0, FALCON_LOGN);
+ PQCLEAN_FALCONPADDED512_AARCH64_iFFT(t1, FALCON_LOGN);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s1tmp = (int16_t *)tx;
+ s2tmp = (int16_t *)tmp;
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_is_short_tmp(s1tmp, s2tmp, (int16_t *) hm, t0, t1)) {
+ memcpy(s2, s2tmp, FALCON_N * sizeof * s2);
+ memcpy(tmp, s1tmp, FALCON_N * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *restrict expanded_key,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min_9;
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig, expanded_key, hm, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AARCH64_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *restrict f, const int8_t *restrict g,
+ const int8_t *restrict F, const int8_t *restrict G,
+ const uint16_t *hm, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+
+ spc.sigma_min = fpr_sigma_min_9;
+ PQCLEAN_FALCONPADDED512_AARCH64_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AARCH64_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig, f, g, F, G, hm, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.c
new file mode 100644
index 000000000..5f63c48fc
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.c
@@ -0,0 +1,71 @@
+/*
+ * Utility functions
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "macrofx4.h"
+#include "util.h"
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+void PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, const unsigned logn) {
+ float64x2x4_t neon_flo64, neon_fhi64;
+ int64x2x4_t neon_lo64, neon_hi64;
+ int32x4_t neon_lo32[2], neon_hi32[2];
+ int16x8_t neon_lo16, neon_hi16;
+ int8x16_t neon_8;
+
+ const unsigned falcon_n = 1 << logn;
+
+ for (unsigned i = 0; i < falcon_n; i += 16) {
+ neon_8 = vld1q_s8(&t[i]);
+
+ // Extend from 8 to 16 bit
+        // x7 | x6 | x5 | x4 - x3 | x2 | x1 | x0
+ neon_lo16 = vmovl_s8(vget_low_s8(neon_8));
+ neon_hi16 = vmovl_high_s8(neon_8);
+
+ // Extend from 16 to 32 bit
+ // xxx3 | xxx2 | xxx1 | xxx0
+ neon_lo32[0] = vmovl_s16(vget_low_s16(neon_lo16));
+ neon_lo32[1] = vmovl_high_s16(neon_lo16);
+ neon_hi32[0] = vmovl_s16(vget_low_s16(neon_hi16));
+ neon_hi32[1] = vmovl_high_s16(neon_hi16);
+
+ // Extend from 32 to 64 bit
+ neon_lo64.val[0] = vmovl_s32(vget_low_s32(neon_lo32[0]));
+ neon_lo64.val[1] = vmovl_high_s32(neon_lo32[0]);
+ neon_lo64.val[2] = vmovl_s32(vget_low_s32(neon_lo32[1]));
+ neon_lo64.val[3] = vmovl_high_s32(neon_lo32[1]);
+
+ neon_hi64.val[0] = vmovl_s32(vget_low_s32(neon_hi32[0]));
+ neon_hi64.val[1] = vmovl_high_s32(neon_hi32[0]);
+ neon_hi64.val[2] = vmovl_s32(vget_low_s32(neon_hi32[1]));
+ neon_hi64.val[3] = vmovl_high_s32(neon_hi32[1]);
+
+ vfcvtx4(neon_flo64, neon_lo64);
+ vfcvtx4(neon_fhi64, neon_hi64);
+
+ vstorex4(&r[i], neon_flo64);
+ vstorex4(&r[i + 8], neon_fhi64);
+ }
+}
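+
+/*
+ * Functionally this is equivalent to the scalar loop
+ *     for (unsigned u = 0; u < falcon_n; u++) r[u] = fpr_of(t[u]);
+ * the NEON version simply widens 16 coefficients at a time from 8-bit to
+ * 64-bit integers before converting them to double precision.
+ */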
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.h b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.h
new file mode 100644
index 000000000..e3576bc5c
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/util.h
@@ -0,0 +1,8 @@
+#ifndef UTIL_H
+#define UTIL_H
+
+#define poly_small_to_fp PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr
+
+void PQCLEAN_FALCONPADDED512_AARCH64_smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn);
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_aarch64/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/vrfy.c
new file mode 100644
index 000000000..c1345d95a
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_aarch64/vrfy.c
@@ -0,0 +1,174 @@
+/*
+ * Falcon signature verification.
+ *
+ * =============================================================================
+ * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ * @author Duc Tri Nguyen
+ */
+
+#include "inner.h"
+#include "poly.h"
+
+/* see inner.h */
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt(int16_t *h) {
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+}
+
+void PQCLEAN_FALCONPADDED512_AARCH64_to_ntt_monty(int16_t *h) {
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_MONT);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_raw(const int16_t *c0, const int16_t *s2,
+ int16_t *h, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ /*
+ * Compute s1 = c0 - s2*h mod phi mod q (in tt[]).
+ */
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT_INV);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(tt, h);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(tt, INVNTT_NONE);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(tt, c0, tt);
+
+ /*
+ * Signature is valid if and only if the aggregate (s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED512_AARCH64_is_short(tt, s2);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_compute_public(int16_t *h, const int8_t *f, const int8_t *g, int16_t *tmp) {
+ int16_t *tt = tmp;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(h, g);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(tt, f);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(tt)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_convert_to_unsigned(h);
+
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_complete_private(int8_t *G, const int8_t *f,
+ const int8_t *g, const int8_t *F,
+ uint8_t *tmp) {
+ int16_t *t1, *t2;
+
+ t1 = (int16_t *)tmp;
+ t2 = t1 + FALCON_N;
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(t1, g);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(t1, NTT_NONE);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(t2, F);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_montmul_ntt(t1, t2);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_int8_to_int16(t2, f);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(t2, NTT_MONT);
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(t2)) {
+ return 0;
+ }
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(t1, t2);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(t1, INVNTT_NINV);
+
+ if (PQCLEAN_FALCONPADDED512_AARCH64_poly_int16_to_int8(G, t1)) {
+ return 0;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_is_invertible(const int16_t *s2, uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT);
+
+ r = PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(tt);
+
+ return (int)(1u - (r >> 15));
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_verify_recover(int16_t *h, const int16_t *c0,
+ const int16_t *s1, const int16_t *s2,
+ uint8_t *tmp) {
+ int16_t *tt = (int16_t *)tmp;
+ uint16_t r;
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_sub_barrett(h, c0, s1);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(h, NTT_NONE);
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ memcpy(tt, s2, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(tt, NTT_MONT);
+ r = PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(tt);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_div_12289(h, tt);
+
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_invntt(h, INVNTT_NINV);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = (uint16_t) (~r & (uint16_t) - PQCLEAN_FALCONPADDED512_AARCH64_is_short(s1, s2));
+ return (int)(r >> 15);
+}
+
+/* see inner.h */
+int PQCLEAN_FALCONPADDED512_AARCH64_count_nttzero(const int16_t *sig, uint8_t *tmp) {
+ int16_t *s2 = (int16_t *)tmp;
+
+ memcpy(s2, sig, sizeof(int16_t) * FALCON_N);
+ PQCLEAN_FALCONPADDED512_AARCH64_poly_ntt(s2, NTT_MONT);
+
+ int r = PQCLEAN_FALCONPADDED512_AARCH64_poly_compare_with_zero(s2);
+
+ return r;
+}
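
verify_raw above recomputes s1 = c0 - s2*h mod (X^N + 1) mod q in the NTT domain and then checks that the pair (s1, s2) is short. A schoolbook sketch of the same check, without the NTT and with a simple 64-bit norm accumulator; the bound parameter here is a stand-in for floor(beta^2) at the chosen degree:

    #include <stdint.h>
    #include <stddef.h>

    #define Q 12289

    /* Schoolbook sketch of the verify_raw check: recompute
     * s1 = c0 - s2*h mod (X^n + 1) mod q, centered around 0, and
     * compare the squared l2-norm of (s1, s2) to the acceptance
     * bound. Illustrative only; the real code uses the NTT and a
     * saturating 32-bit norm computation. */
    static int verify_raw_ref(const int16_t *c0, const int16_t *s2,
                              const int16_t *h, size_t n, uint64_t bound) {
        uint64_t norm = 0;
        for (size_t i = 0; i < n; i++) {
            int64_t acc = 0;
            for (size_t j = 0; j <= i; j++) {
                acc += (int64_t)s2[j] * h[i - j];        /* no wrap-around */
            }
            for (size_t j = i + 1; j < n; j++) {
                acc -= (int64_t)s2[j] * h[n + i - j];    /* X^n = -1 */
            }
            int64_t s1 = ((int64_t)c0[i] - acc) % Q;
            if (s1 > Q / 2) {
                s1 -= Q;
            } else if (s1 < -(Q / 2)) {
                s1 += Q;
            }
            norm += (uint64_t)(s1 * s1)
                  + (uint64_t)((int64_t)s2[i] * s2[i]);
        }
        return norm <= bound;
    }
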
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/LICENSE b/src/sig/falcon/pqclean_falcon-padded-512_avx2/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/api.h b/src/sig/falcon/pqclean_falcon-padded-512_avx2/api.h
new file mode 100644
index 000000000..c039206c7
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED512_AVX2_API_H
+#define PQCLEAN_FALCONPADDED512_AVX2_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES 1281
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES 666
+
+#define PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_ALGNAME "Falcon-padded-512"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
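
A minimal sign/verify round trip against the API declared above; this is a sketch that assumes the library (and the randombytes provider PQClean expects) is linked in:

    #include <stdio.h>
    #include "api.h"

    int main(void) {
        uint8_t pk[PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES];
        uint8_t sk[PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES];
        uint8_t sig[PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES];
        size_t siglen;
        const uint8_t msg[] = "test message";

        if (PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(pk, sk) != 0) {
            return 1;
        }
        if (PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(sig, &siglen,
                msg, sizeof msg - 1, sk) != 0) {
            return 1;
        }
        /* siglen is at most CRYPTO_BYTES, per the contract above. */
        if (PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(sig, siglen,
                msg, sizeof msg - 1, pk) != 0) {
            return 1;
        }
        printf("verified a %zu-byte signature\n", siglen);
        return 0;
    }
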
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/codec.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/codec.c
new file mode 100644
index 000000000..64f07533a
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * fit in the accumulator (a uint32_t).
+ */
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_AVX2_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for private key, and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest signature coefficient observed during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
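
comp_encode above spends, for each signature coefficient, one sign bit, the seven low bits of the absolute value, and a unary run of (|x| >> 7) zeros closed by a 1 bit. A small helper that counts those bits per coefficient makes the format easy to check (sketch only; the helper name is made up):

    #include <stdint.h>

    /* Number of bits comp_encode spends on one coefficient x with
     * |x| <= 2047: 1 sign bit, 7 low bits of |x|, then |x| >> 7
     * zeros and a terminating 1 bit. Illustrative helper only. */
    static unsigned comp_bits_for(int16_t x) {
        unsigned w = (unsigned)(x < 0 ? -x : x);
        return 1 + 7 + (w >> 7) + 1;
    }

For example, x = 300 gives |x| = 0b100101100: the low seven bits are 0101100, the high part is 2, so the coefficient costs 1 + 7 + 2 + 1 = 11 bits.
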
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/common.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/common.c
new file mode 100644
index 000000000..70ef4d04d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+ * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+ * We also reduce modulo q the values; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
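
hash_to_point_ct above reduces each 16-bit SHAKE sample modulo q = 12289 without branches (two conditional subtractions of 2q, then one of q) and marks rejected samples (w >= 61445 = 5q) as 0xFFFF before squeezing them out. The same per-sample mapping written with ordinary branches, for readability (a sketch, not the constant-time code):

    #include <stdint.h>

    #define Q 12289

    /* Branching equivalent of the branchless sample mapping used in
     * hash_to_point_ct(): keep w mod q when w < 5*q, otherwise
     * return 0xFFFF to mark the sample as rejected. */
    static uint16_t reduce_or_reject(uint32_t w) {
        if (w >= 5 * (uint32_t)Q) {    /* 61445..65535: rejected */
            return 0xFFFF;
        }
        return (uint16_t)(w % Q);
    }
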
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/fft.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fft.c
new file mode 100644
index 000000000..8ba5b435d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fft.c
@@ -0,0 +1,1108 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ if (ht >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((m + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ z_re = _mm256_loadu_pd(&f[j + ht].v);
+ z_im = _mm256_loadu_pd(&f[j + ht + hn].v);
+ y_re = FMSUB(z_re, s_re,
+ _mm256_mul_pd(z_im, s_im));
+ y_im = FMADD(z_re, s_im,
+ _mm256_mul_pd(z_im, s_re));
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ _mm256_storeu_pd(&f[j + ht].v,
+ _mm256_sub_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + ht + hn].v,
+ _mm256_sub_pd(x_im, y_im));
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im,
+ y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+ * simple doubling of the values in all the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ if (t >= 4) {
+ __m256d s_re, s_im;
+
+ s_re = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 0].v);
+ s_im = _mm256_set1_pd(
+ fpr_gm_tab[((hm + i1) << 1) + 1].v);
+ for (j = j1; j < j2; j += 4) {
+ __m256d x_re, x_im, y_re, y_im;
+ __m256d z_re, z_im;
+
+ x_re = _mm256_loadu_pd(&f[j].v);
+ x_im = _mm256_loadu_pd(&f[j + hn].v);
+ y_re = _mm256_loadu_pd(&f[j + t].v);
+ y_im = _mm256_loadu_pd(&f[j + t + hn].v);
+ _mm256_storeu_pd(&f[j].v,
+ _mm256_add_pd(x_re, y_re));
+ _mm256_storeu_pd(&f[j + hn].v,
+ _mm256_add_pd(x_im, y_im));
+ x_re = _mm256_sub_pd(y_re, x_re);
+ x_im = _mm256_sub_pd(x_im, y_im);
+ z_re = FMSUB(x_im, s_im,
+ _mm256_mul_pd(x_re, s_re));
+ z_im = FMADD(x_re, s_im,
+ _mm256_mul_pd(x_im, s_re));
+ _mm256_storeu_pd(&f[j + t].v, z_re);
+ _mm256_storeu_pd(&f[j + t + hn].v, z_im);
+ }
+ } else {
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im,
+ x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_add_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_sub_pd(
+ _mm256_loadu_pd(&a[u].v),
+ _mm256_loadu_pd(&b[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 8) {
+ __m256d s;
+
+ s = _mm256_set1_pd(-0.0);
+ for (u = (n >> 1); u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_xor_pd(_mm256_loadu_pd(&a[u].v), s));
+ }
+ } else {
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMSUB(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMADD(
+ a_re, b_im, _mm256_mul_pd(a_im, b_re));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d zero;
+
+ zero = _mm256_setzero_pd();
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v,
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)));
+ _mm256_storeu_pd(&a[u + hn].v, zero);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ if (n >= 4) {
+ __m256d x4;
+
+ x4 = _mm256_set1_pd(x.v);
+ for (u = 0; u < n; u += 4) {
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(x4, _mm256_loadu_pd(&a[u].v)));
+ }
+ } else {
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im, t;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ t = _mm256_div_pd(one,
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im)));
+ b_re = _mm256_mul_pd(b_re, t);
+ b_im = _mm256_mul_pd(b_im, t);
+ c_re = FMADD(
+ a_re, b_re, _mm256_mul_pd(a_im, b_im));
+ c_im = FMSUB(
+ a_im, b_re, _mm256_mul_pd(a_re, b_im));
+ _mm256_storeu_pd(&a[u].v, c_re);
+ _mm256_storeu_pd(&a[u + hn].v, c_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, dv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ b_re = _mm256_loadu_pd(&b[u].v);
+ b_im = _mm256_loadu_pd(&b[u + hn].v);
+ dv = _mm256_div_pd(one,
+ _mm256_add_pd(
+ FMADD(a_re, a_re,
+ _mm256_mul_pd(a_im, a_im)),
+ FMADD(b_re, b_re,
+ _mm256_mul_pd(b_im, b_im))));
+ _mm256_storeu_pd(&d[u].v, dv);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d F_re, F_im, G_re, G_im;
+ __m256d f_re, f_im, g_re, g_im;
+ __m256d a_re, a_im, b_re, b_im;
+
+ F_re = _mm256_loadu_pd(&F[u].v);
+ F_im = _mm256_loadu_pd(&F[u + hn].v);
+ G_re = _mm256_loadu_pd(&G[u].v);
+ G_im = _mm256_loadu_pd(&G[u + hn].v);
+ f_re = _mm256_loadu_pd(&f[u].v);
+ f_im = _mm256_loadu_pd(&f[u + hn].v);
+ g_re = _mm256_loadu_pd(&g[u].v);
+ g_im = _mm256_loadu_pd(&g[u + hn].v);
+
+ a_re = FMADD(F_re, f_re,
+ _mm256_mul_pd(F_im, f_im));
+ a_im = FMSUB(F_im, f_re,
+ _mm256_mul_pd(F_re, f_im));
+ b_re = FMADD(G_re, g_re,
+ _mm256_mul_pd(G_im, g_im));
+ b_im = FMSUB(G_im, g_re,
+ _mm256_mul_pd(G_re, g_im));
+ _mm256_storeu_pd(&d[u].v,
+ _mm256_add_pd(a_re, b_re));
+ _mm256_storeu_pd(&d[u + hn].v,
+ _mm256_add_pd(a_im, b_im));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ for (u = 0; u < hn; u += 4) {
+ __m256d a_re, a_im, bv;
+
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ bv = _mm256_loadu_pd(&b[u].v);
+ _mm256_storeu_pd(&a[u].v,
+ _mm256_mul_pd(a_re, bv));
+ _mm256_storeu_pd(&a[u + hn].v,
+ _mm256_mul_pd(a_im, bv));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d ib, a_re, a_im;
+
+ ib = _mm256_div_pd(one, _mm256_loadu_pd(&b[u].v));
+ a_re = _mm256_loadu_pd(&a[u].v);
+ a_im = _mm256_loadu_pd(&a[u + hn].v);
+ _mm256_storeu_pd(&a[u].v, _mm256_mul_pd(a_re, ib));
+ _mm256_storeu_pd(&a[u + hn].v, _mm256_mul_pd(a_im, ib));
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&g11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&g11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&g01[u].v, mu_re);
+ _mm256_storeu_pd(&g01[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ if (n >= 8) {
+ __m256d one;
+
+ one = _mm256_set1_pd(1.0);
+ for (u = 0; u < hn; u += 4) {
+ __m256d g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ __m256d t, mu_re, mu_im, xi_re, xi_im;
+
+ g00_re = _mm256_loadu_pd(&g00[u].v);
+ g00_im = _mm256_loadu_pd(&g00[u + hn].v);
+ g01_re = _mm256_loadu_pd(&g01[u].v);
+ g01_im = _mm256_loadu_pd(&g01[u + hn].v);
+ g11_re = _mm256_loadu_pd(&g11[u].v);
+ g11_im = _mm256_loadu_pd(&g11[u + hn].v);
+
+ t = _mm256_div_pd(one,
+ FMADD(g00_re, g00_re,
+ _mm256_mul_pd(g00_im, g00_im)));
+ g00_re = _mm256_mul_pd(g00_re, t);
+ g00_im = _mm256_mul_pd(g00_im, t);
+ mu_re = FMADD(g01_re, g00_re,
+ _mm256_mul_pd(g01_im, g00_im));
+ mu_im = FMSUB(g01_re, g00_im,
+ _mm256_mul_pd(g01_im, g00_re));
+ xi_re = FMSUB(mu_re, g01_re,
+ _mm256_mul_pd(mu_im, g01_im));
+ xi_im = FMADD(mu_im, g01_re,
+ _mm256_mul_pd(mu_re, g01_im));
+ _mm256_storeu_pd(&d11[u].v,
+ _mm256_sub_pd(g11_re, xi_re));
+ _mm256_storeu_pd(&d11[u + hn].v,
+ _mm256_add_pd(g11_im, xi_im));
+ _mm256_storeu_pd(&l10[u].v, mu_re);
+ _mm256_storeu_pd(&l10[u + hn].v, mu_im);
+ }
+ } else {
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im,
+ mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn],
+ g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+ * bit-reversal function over the ring degree. This changes
+ * indexes with regards to the Falcon specification.
+ */
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 8) {
+ __m256d half, sv;
+
+ half = _mm256_set1_pd(0.5);
+ sv = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
+ for (u = 0; u < qn; u += 2) {
+ __m256d ab_re, ab_im, ff0, ff1, ff2, ff3, gmt;
+
+ ab_re = _mm256_loadu_pd(&f[(u << 1)].v);
+ ab_im = _mm256_loadu_pd(&f[(u << 1) + hn].v);
+ ff0 = _mm256_mul_pd(_mm256_hadd_pd(ab_re, ab_im), half);
+ ff0 = _mm256_permute4x64_pd(ff0, 0xD8);
+ _mm_storeu_pd(&f0[u].v,
+ _mm256_extractf128_pd(ff0, 0));
+ _mm_storeu_pd(&f0[u + qn].v,
+ _mm256_extractf128_pd(ff0, 1));
+
+ ff1 = _mm256_mul_pd(_mm256_hsub_pd(ab_re, ab_im), half);
+ gmt = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ ff2 = _mm256_shuffle_pd(ff1, ff1, 0x5);
+ ff3 = _mm256_hadd_pd(
+ _mm256_mul_pd(ff1, gmt),
+ _mm256_xor_pd(_mm256_mul_pd(ff2, gmt), sv));
+ ff3 = _mm256_permute4x64_pd(ff3, 0xD8);
+ _mm_storeu_pd(&f1[u].v,
+ _mm256_extractf128_pd(ff3, 0));
+ _mm_storeu_pd(&f1[u + qn].v,
+ _mm256_extractf128_pd(ff3, 1));
+ }
+ } else {
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+ }
+}
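+
+/*
+ * A minimal sketch (hypothetical helper, not used by the code above) of
+ * the rev() bit-reversal permutation mentioned in the comment at the top
+ * of poly_split_fft: element i of the FFT representation corresponds to
+ * the natural-order index obtained by reversing the logn bits of i.
+ */
+static inline unsigned
+rev_index_sketch(unsigned i, unsigned logn) {
+ unsigned r, k;
+
+ r = 0;
+ for (k = 0; k < logn; k ++) {
+ r = (r << 1) | (i & 1u);
+ i >>= 1;
+ }
+ return r;
+}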
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ if (n >= 16) {
+ for (u = 0; u < qn; u += 4) {
+ __m256d a_re, a_im, b_re, b_im, c_re, c_im;
+ __m256d gm1, gm2, g_re, g_im;
+ __m256d t_re, t_im, u_re, u_im;
+ __m256d tu1_re, tu2_re, tu1_im, tu2_im;
+
+ a_re = _mm256_loadu_pd(&f0[u].v);
+ a_im = _mm256_loadu_pd(&f0[u + qn].v);
+ c_re = _mm256_loadu_pd(&f1[u].v);
+ c_im = _mm256_loadu_pd(&f1[u + qn].v);
+
+ gm1 = _mm256_loadu_pd(&fpr_gm_tab[(u + hn) << 1].v);
+ gm2 = _mm256_loadu_pd(&fpr_gm_tab[(u + 2 + hn) << 1].v);
+ g_re = _mm256_unpacklo_pd(gm1, gm2);
+ g_im = _mm256_unpackhi_pd(gm1, gm2);
+ g_re = _mm256_permute4x64_pd(g_re, 0xD8);
+ g_im = _mm256_permute4x64_pd(g_im, 0xD8);
+
+ b_re = FMSUB(
+ c_re, g_re, _mm256_mul_pd(c_im, g_im));
+ b_im = FMADD(
+ c_re, g_im, _mm256_mul_pd(c_im, g_re));
+
+ t_re = _mm256_add_pd(a_re, b_re);
+ t_im = _mm256_add_pd(a_im, b_im);
+ u_re = _mm256_sub_pd(a_re, b_re);
+ u_im = _mm256_sub_pd(a_im, b_im);
+
+ tu1_re = _mm256_unpacklo_pd(t_re, u_re);
+ tu2_re = _mm256_unpackhi_pd(t_re, u_re);
+ tu1_im = _mm256_unpacklo_pd(t_im, u_im);
+ tu2_im = _mm256_unpackhi_pd(t_im, u_im);
+ _mm256_storeu_pd(&f[(u << 1)].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4].v,
+ _mm256_permute2f128_pd(tu1_re, tu2_re, 0x31));
+ _mm256_storeu_pd(&f[(u << 1) + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x20));
+ _mm256_storeu_pd(&f[(u << 1) + 4 + hn].v,
+ _mm256_permute2f128_pd(tu1_im, tu2_im, 0x31));
+ }
+ } else {
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+ }
+}
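+
+/*
+ * A brief note on how the two routines above relate (as suggested by
+ * their scalar branches): poly_merge_fft reverses poly_split_fft. The
+ * twiddle factors in fpr_gm_tab lie on the unit circle, so splitting f
+ * into (f0, f1) and merging them back reconstructs f, up to
+ * floating-point rounding.
+ */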
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.c
new file mode 100644
index 000000000..8940f3400
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.c
@@ -0,0 +1,1076 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+const fpr fpr_gm_tab[] = {
+ {0}, {0}, /* unused */
+ {-0.000000000000000000000000000}, { 1.000000000000000000000000000},
+ { 0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ {-0.707106781186547524400844362}, { 0.707106781186547524400844362},
+ { 0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ {-0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ { 0.382683432365089771728459984}, { 0.923879532511286756128183189},
+ {-0.923879532511286756128183189}, { 0.382683432365089771728459984},
+ { 0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ {-0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ { 0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ {-0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ { 0.831469612302545237078788378}, { 0.555570233019602224742830814},
+ {-0.555570233019602224742830814}, { 0.831469612302545237078788378},
+ { 0.195090322016128267848284868}, { 0.980785280403230449126182236},
+ {-0.980785280403230449126182236}, { 0.195090322016128267848284868},
+ { 0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ {-0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ { 0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ {-0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ { 0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ {-0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ { 0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ {-0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ { 0.956940335732208864935797887}, { 0.290284677254462367636192376},
+ {-0.290284677254462367636192376}, { 0.956940335732208864935797887},
+ { 0.471396736825997648556387626}, { 0.881921264348355029712756864},
+ {-0.881921264348355029712756864}, { 0.471396736825997648556387626},
+ { 0.773010453362736960810906610}, { 0.634393284163645498215171613},
+ {-0.634393284163645498215171613}, { 0.773010453362736960810906610},
+ { 0.098017140329560601994195564}, { 0.995184726672196886244836953},
+ {-0.995184726672196886244836953}, { 0.098017140329560601994195564},
+ { 0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ {-0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ { 0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ {-0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ { 0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ {-0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ { 0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ {-0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ { 0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ {-0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ { 0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ {-0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ { 0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ {-0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ { 0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ {-0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ { 0.989176509964780973451673738}, { 0.146730474455361751658850130},
+ {-0.146730474455361751658850130}, { 0.989176509964780973451673738},
+ { 0.595699304492433343467036529}, { 0.803207531480644909806676513},
+ {-0.803207531480644909806676513}, { 0.595699304492433343467036529},
+ { 0.857728610000272069902269984}, { 0.514102744193221726593693839},
+ {-0.514102744193221726593693839}, { 0.857728610000272069902269984},
+ { 0.242980179903263889948274162}, { 0.970031253194543992603984207},
+ {-0.970031253194543992603984207}, { 0.242980179903263889948274162},
+ { 0.941544065183020778412509403}, { 0.336889853392220050689253213},
+ {-0.336889853392220050689253213}, { 0.941544065183020778412509403},
+ { 0.427555093430282094320966857}, { 0.903989293123443331586200297},
+ {-0.903989293123443331586200297}, { 0.427555093430282094320966857},
+ { 0.740951125354959091175616897}, { 0.671558954847018400625376850},
+ {-0.671558954847018400625376850}, { 0.740951125354959091175616897},
+ { 0.049067674327418014254954977}, { 0.998795456205172392714771605},
+ {-0.998795456205172392714771605}, { 0.049067674327418014254954977},
+ { 0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ {-0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ { 0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ {-0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ { 0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ {-0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ { 0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ {-0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ { 0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ {-0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ { 0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ {-0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ { 0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ {-0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ { 0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ {-0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ { 0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ {-0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ { 0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ {-0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ { 0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ {-0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ { 0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ {-0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ { 0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ {-0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ { 0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ {-0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ { 0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ {-0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ { 0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ {-0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ { 0.997290456678690216135597140}, { 0.073564563599667423529465622},
+ {-0.073564563599667423529465622}, { 0.997290456678690216135597140},
+ { 0.653172842953776764084203014}, { 0.757208846506484547575464054},
+ {-0.757208846506484547575464054}, { 0.653172842953776764084203014},
+ { 0.893224301195515320342416447}, { 0.449611329654606600046294579},
+ {-0.449611329654606600046294579}, { 0.893224301195515320342416447},
+ { 0.313681740398891476656478846}, { 0.949528180593036667195936074},
+ {-0.949528180593036667195936074}, { 0.313681740398891476656478846},
+ { 0.963776065795439866686464356}, { 0.266712757474898386325286515},
+ {-0.266712757474898386325286515}, { 0.963776065795439866686464356},
+ { 0.492898192229784036873026689}, { 0.870086991108711418652292404},
+ {-0.870086991108711418652292404}, { 0.492898192229784036873026689},
+ { 0.788346427626606262009164705}, { 0.615231590580626845484913563},
+ {-0.615231590580626845484913563}, { 0.788346427626606262009164705},
+ { 0.122410675199216198498704474}, { 0.992479534598709998156767252},
+ {-0.992479534598709998156767252}, { 0.122410675199216198498704474},
+ { 0.985277642388941244774018433}, { 0.170961888760301226363642357},
+ {-0.170961888760301226363642357}, { 0.985277642388941244774018433},
+ { 0.575808191417845300745972454}, { 0.817584813151583696504920884},
+ {-0.817584813151583696504920884}, { 0.575808191417845300745972454},
+ { 0.844853565249707073259571205}, { 0.534997619887097210663076905},
+ {-0.534997619887097210663076905}, { 0.844853565249707073259571205},
+ { 0.219101240156869797227737547}, { 0.975702130038528544460395766},
+ {-0.975702130038528544460395766}, { 0.219101240156869797227737547},
+ { 0.932992798834738887711660256}, { 0.359895036534988148775104572},
+ {-0.359895036534988148775104572}, { 0.932992798834738887711660256},
+ { 0.405241314004989870908481306}, { 0.914209755703530654635014829},
+ {-0.914209755703530654635014829}, { 0.405241314004989870908481306},
+ { 0.724247082951466920941069243}, { 0.689540544737066924616730630},
+ {-0.689540544737066924616730630}, { 0.724247082951466920941069243},
+ { 0.024541228522912288031734529}, { 0.999698818696204220115765650},
+ {-0.999698818696204220115765650}, { 0.024541228522912288031734529},
+ { 0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ {-0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ { 0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ {-0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ { 0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ {-0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ { 0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ {-0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ { 0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ {-0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ { 0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ {-0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ { 0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ {-0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ { 0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ {-0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ { 0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ {-0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ { 0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ {-0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ { 0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ {-0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ { 0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ {-0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ { 0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ {-0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ { 0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ {-0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ { 0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ {-0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ { 0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ {-0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ { 0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ {-0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ { 0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ {-0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ { 0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ {-0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ { 0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ {-0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ { 0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ {-0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ { 0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ {-0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ { 0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ {-0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ { 0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ {-0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ { 0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ {-0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ { 0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ {-0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ { 0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ {-0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ { 0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ {-0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ { 0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ {-0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ { 0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ {-0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ { 0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ {-0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ { 0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ {-0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ { 0.999322384588349500896221011}, { 0.036807222941358832324332691},
+ {-0.036807222941358832324332691}, { 0.999322384588349500896221011},
+ { 0.680600997795453050594430464}, { 0.732654271672412834615546649},
+ {-0.732654271672412834615546649}, { 0.680600997795453050594430464},
+ { 0.909167983090522376563884788}, { 0.416429560097637182562598911},
+ {-0.416429560097637182562598911}, { 0.909167983090522376563884788},
+ { 0.348418680249434568419308588}, { 0.937339011912574923201899593},
+ {-0.937339011912574923201899593}, { 0.348418680249434568419308588},
+ { 0.972939952205560145467720114}, { 0.231058108280671119643236018},
+ {-0.231058108280671119643236018}, { 0.972939952205560145467720114},
+ { 0.524589682678468906215098464}, { 0.851355193105265142261290312},
+ {-0.851355193105265142261290312}, { 0.524589682678468906215098464},
+ { 0.810457198252594791726703434}, { 0.585797857456438860328080838},
+ {-0.585797857456438860328080838}, { 0.810457198252594791726703434},
+ { 0.158858143333861441684385360}, { 0.987301418157858382399815802},
+ {-0.987301418157858382399815802}, { 0.158858143333861441684385360},
+ { 0.990902635427780025108237011}, { 0.134580708507126186316358409},
+ {-0.134580708507126186316358409}, { 0.990902635427780025108237011},
+ { 0.605511041404325513920626941}, { 0.795836904608883536262791915},
+ {-0.795836904608883536262791915}, { 0.605511041404325513920626941},
+ { 0.863972856121586737918147054}, { 0.503538383725717558691867071},
+ {-0.503538383725717558691867071}, { 0.863972856121586737918147054},
+ { 0.254865659604514571553980779}, { 0.966976471044852109087220226},
+ {-0.966976471044852109087220226}, { 0.254865659604514571553980779},
+ { 0.945607325380521325730945387}, { 0.325310292162262934135954708},
+ {-0.325310292162262934135954708}, { 0.945607325380521325730945387},
+ { 0.438616238538527637647025738}, { 0.898674465693953843041976744},
+ {-0.898674465693953843041976744}, { 0.438616238538527637647025738},
+ { 0.749136394523459325469203257}, { 0.662415777590171761113069817},
+ {-0.662415777590171761113069817}, { 0.749136394523459325469203257},
+ { 0.061320736302208577782614593}, { 0.998118112900149207125155861},
+ {-0.998118112900149207125155861}, { 0.061320736302208577782614593},
+ { 0.996312612182778012627226190}, { 0.085797312344439890461556332},
+ {-0.085797312344439890461556332}, { 0.996312612182778012627226190},
+ { 0.643831542889791465068086063}, { 0.765167265622458925888815999},
+ {-0.765167265622458925888815999}, { 0.643831542889791465068086063},
+ { 0.887639620402853947760181617}, { 0.460538710958240023633181487},
+ {-0.460538710958240023633181487}, { 0.887639620402853947760181617},
+ { 0.302005949319228067003463232}, { 0.953306040354193836916740383},
+ {-0.953306040354193836916740383}, { 0.302005949319228067003463232},
+ { 0.960430519415565811199035138}, { 0.278519689385053105207848526},
+ {-0.278519689385053105207848526}, { 0.960430519415565811199035138},
+ { 0.482183772079122748517344481}, { 0.876070094195406607095844268},
+ {-0.876070094195406607095844268}, { 0.482183772079122748517344481},
+ { 0.780737228572094478301588484}, { 0.624859488142386377084072816},
+ {-0.624859488142386377084072816}, { 0.780737228572094478301588484},
+ { 0.110222207293883058807899140}, { 0.993906970002356041546922813},
+ {-0.993906970002356041546922813}, { 0.110222207293883058807899140},
+ { 0.983105487431216327180301155}, { 0.183039887955140958516532578},
+ {-0.183039887955140958516532578}, { 0.983105487431216327180301155},
+ { 0.565731810783613197389765011}, { 0.824589302785025264474803737},
+ {-0.824589302785025264474803737}, { 0.565731810783613197389765011},
+ { 0.838224705554838043186996856}, { 0.545324988422046422313987347},
+ {-0.545324988422046422313987347}, { 0.838224705554838043186996856},
+ { 0.207111376192218549708116020}, { 0.978317370719627633106240097},
+ {-0.978317370719627633106240097}, { 0.207111376192218549708116020},
+ { 0.928506080473215565937167396}, { 0.371317193951837543411934967},
+ {-0.371317193951837543411934967}, { 0.928506080473215565937167396},
+ { 0.393992040061048108596188661}, { 0.919113851690057743908477789},
+ {-0.919113851690057743908477789}, { 0.393992040061048108596188661},
+ { 0.715730825283818654125532623}, { 0.698376249408972853554813503},
+ {-0.698376249408972853554813503}, { 0.715730825283818654125532623},
+ { 0.012271538285719926079408262}, { 0.999924701839144540921646491},
+ {-0.999924701839144540921646491}, { 0.012271538285719926079408262},
+ { 0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ {-0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ { 0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ {-0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ { 0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ {-0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ { 0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ {-0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ { 0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ {-0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ { 0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ {-0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ { 0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ {-0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ { 0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ {-0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ { 0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ {-0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ { 0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ {-0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ { 0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ {-0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ { 0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ {-0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ { 0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ {-0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ { 0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ {-0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ { 0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ {-0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ { 0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ {-0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ { 0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ {-0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ { 0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ {-0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ { 0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ {-0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ { 0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ {-0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ { 0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ {-0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ { 0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ {-0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ { 0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ {-0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ { 0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ {-0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ { 0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ {-0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ { 0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ {-0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ { 0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ {-0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ { 0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ {-0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ { 0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ {-0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ { 0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ {-0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ { 0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ {-0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ { 0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ {-0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ { 0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ {-0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ { 0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ {-0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ { 0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ {-0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ { 0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ {-0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ { 0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ {-0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ { 0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ {-0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ { 0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ {-0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ { 0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ {-0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ { 0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ {-0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ { 0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ {-0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ { 0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ {-0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ { 0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ {-0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ { 0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ {-0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ { 0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ {-0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ { 0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ {-0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ { 0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ {-0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ { 0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ {-0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ { 0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ {-0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ { 0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ {-0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ { 0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ {-0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ { 0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ {-0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ { 0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ {-0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ { 0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ {-0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ { 0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ {-0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ { 0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ {-0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ { 0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ {-0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ { 0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ {-0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ { 0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ {-0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ { 0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ {-0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ { 0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ {-0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ { 0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ {-0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ { 0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ {-0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ { 0.999830581795823422015722275}, { 0.018406729905804820927366313},
+ {-0.018406729905804820927366313}, { 0.999830581795823422015722275},
+ { 0.693971460889654009003734389}, { 0.720002507961381629076682999},
+ {-0.720002507961381629076682999}, { 0.693971460889654009003734389},
+ { 0.916679059921042663116457013}, { 0.399624199845646828544117031},
+ {-0.399624199845646828544117031}, { 0.916679059921042663116457013},
+ { 0.365612997804773870011745909}, { 0.930766961078983731944872340},
+ {-0.930766961078983731944872340}, { 0.365612997804773870011745909},
+ { 0.977028142657754351485866211}, { 0.213110319916091373967757518},
+ {-0.213110319916091373967757518}, { 0.977028142657754351485866211},
+ { 0.540171472729892881297845480}, { 0.841554977436898409603499520},
+ {-0.841554977436898409603499520}, { 0.540171472729892881297845480},
+ { 0.821102514991104679060430820}, { 0.570780745886967280232652864},
+ {-0.570780745886967280232652864}, { 0.821102514991104679060430820},
+ { 0.177004220412148756196839844}, { 0.984210092386929073193874387},
+ {-0.984210092386929073193874387}, { 0.177004220412148756196839844},
+ { 0.993211949234794533104601012}, { 0.116318630911904767252544319},
+ {-0.116318630911904767252544319}, { 0.993211949234794533104601012},
+ { 0.620057211763289178646268191}, { 0.784556597155575233023892575},
+ {-0.784556597155575233023892575}, { 0.620057211763289178646268191},
+ { 0.873094978418290098636085973}, { 0.487550160148435954641485027},
+ {-0.487550160148435954641485027}, { 0.873094978418290098636085973},
+ { 0.272621355449948984493347477}, { 0.962121404269041595429604316},
+ {-0.962121404269041595429604316}, { 0.272621355449948984493347477},
+ { 0.951435020969008369549175569}, { 0.307849640041534893682063646},
+ {-0.307849640041534893682063646}, { 0.951435020969008369549175569},
+ { 0.455083587126343823535869268}, { 0.890448723244757889952150560},
+ {-0.890448723244757889952150560}, { 0.455083587126343823535869268},
+ { 0.761202385484261814029709836}, { 0.648514401022112445084560551},
+ {-0.648514401022112445084560551}, { 0.761202385484261814029709836},
+ { 0.079682437971430121147120656}, { 0.996820299291165714972629398},
+ {-0.996820299291165714972629398}, { 0.079682437971430121147120656},
+ { 0.997723066644191609848546728}, { 0.067443919563664057897972422},
+ {-0.067443919563664057897972422}, { 0.997723066644191609848546728},
+ { 0.657806693297078656931182264}, { 0.753186799043612482483430486},
+ {-0.753186799043612482483430486}, { 0.657806693297078656931182264},
+ { 0.895966249756185155914560282}, { 0.444122144570429231642069418},
+ {-0.444122144570429231642069418}, { 0.895966249756185155914560282},
+ { 0.319502030816015677901518272}, { 0.947585591017741134653387321},
+ {-0.947585591017741134653387321}, { 0.319502030816015677901518272},
+ { 0.965394441697689374550843858}, { 0.260794117915275518280186509},
+ {-0.260794117915275518280186509}, { 0.965394441697689374550843858},
+ { 0.498227666972781852410983869}, { 0.867046245515692651480195629},
+ {-0.867046245515692651480195629}, { 0.498227666972781852410983869},
+ { 0.792106577300212351782342879}, { 0.610382806276309452716352152},
+ {-0.610382806276309452716352152}, { 0.792106577300212351782342879},
+ { 0.128498110793793172624415589}, { 0.991709753669099522860049931},
+ {-0.991709753669099522860049931}, { 0.128498110793793172624415589},
+ { 0.986308097244598647863297524}, { 0.164913120489969921418189113},
+ {-0.164913120489969921418189113}, { 0.986308097244598647863297524},
+ { 0.580813958095764545075595272}, { 0.814036329705948361654516690},
+ {-0.814036329705948361654516690}, { 0.580813958095764545075595272},
+ { 0.848120344803297251279133563}, { 0.529803624686294668216054671},
+ {-0.529803624686294668216054671}, { 0.848120344803297251279133563},
+ { 0.225083911359792835991642120}, { 0.974339382785575860518721668},
+ {-0.974339382785575860518721668}, { 0.225083911359792835991642120},
+ { 0.935183509938947577642207480}, { 0.354163525420490382357395796},
+ {-0.354163525420490382357395796}, { 0.935183509938947577642207480},
+ { 0.410843171057903942183466675}, { 0.911706032005429851404397325},
+ {-0.911706032005429851404397325}, { 0.410843171057903942183466675},
+ { 0.728464390448225196492035438}, { 0.685083667772700381362052545},
+ {-0.685083667772700381362052545}, { 0.728464390448225196492035438},
+ { 0.030674803176636625934021028}, { 0.999529417501093163079703322},
+ {-0.999529417501093163079703322}, { 0.030674803176636625934021028},
+ { 0.999077727752645382888781997}, { 0.042938256934940823077124540},
+ {-0.042938256934940823077124540}, { 0.999077727752645382888781997},
+ { 0.676092703575315960360419228}, { 0.736816568877369875090132520},
+ {-0.736816568877369875090132520}, { 0.676092703575315960360419228},
+ { 0.906595704514915365332960588}, { 0.422000270799799685941287941},
+ {-0.422000270799799685941287941}, { 0.906595704514915365332960588},
+ { 0.342660717311994397592781983}, { 0.939459223602189911962669246},
+ {-0.939459223602189911962669246}, { 0.342660717311994397592781983},
+ { 0.971503890986251775537099622}, { 0.237023605994367206867735915},
+ {-0.237023605994367206867735915}, { 0.971503890986251775537099622},
+ { 0.519355990165589587361829932}, { 0.854557988365400520767862276},
+ {-0.854557988365400520767862276}, { 0.519355990165589587361829932},
+ { 0.806847553543799272206514313}, { 0.590759701858874228423887908},
+ {-0.590759701858874228423887908}, { 0.806847553543799272206514313},
+ { 0.152797185258443427720336613}, { 0.988257567730749491404792538},
+ {-0.988257567730749491404792538}, { 0.152797185258443427720336613},
+ { 0.990058210262297105505906464}, { 0.140658239332849230714788846},
+ {-0.140658239332849230714788846}, { 0.990058210262297105505906464},
+ { 0.600616479383868926653875896}, { 0.799537269107905033500246232},
+ {-0.799537269107905033500246232}, { 0.600616479383868926653875896},
+ { 0.860866938637767279344583877}, { 0.508830142543107036931749324},
+ {-0.508830142543107036931749324}, { 0.860866938637767279344583877},
+ { 0.248927605745720168110682816}, { 0.968522094274417316221088329},
+ {-0.968522094274417316221088329}, { 0.248927605745720168110682816},
+ { 0.943593458161960361495301445}, { 0.331106305759876401737190737},
+ {-0.331106305759876401737190737}, { 0.943593458161960361495301445},
+ { 0.433093818853151968484222638}, { 0.901348847046022014570746093},
+ {-0.901348847046022014570746093}, { 0.433093818853151968484222638},
+ { 0.745057785441465962407907310}, { 0.666999922303637506650154222},
+ {-0.666999922303637506650154222}, { 0.745057785441465962407907310},
+ { 0.055195244349689939809447526}, { 0.998475580573294752208559038},
+ {-0.998475580573294752208559038}, { 0.055195244349689939809447526},
+ { 0.995767414467659793982495643}, { 0.091908956497132728624990979},
+ {-0.091908956497132728624990979}, { 0.995767414467659793982495643},
+ { 0.639124444863775743801488193}, { 0.769103337645579639346626069},
+ {-0.769103337645579639346626069}, { 0.639124444863775743801488193},
+ { 0.884797098430937780104007041}, { 0.465976495767966177902756065},
+ {-0.465976495767966177902756065}, { 0.884797098430937780104007041},
+ { 0.296150888243623824121786128}, { 0.955141168305770721498157712},
+ {-0.955141168305770721498157712}, { 0.296150888243623824121786128},
+ { 0.958703474895871555374645792}, { 0.284407537211271843618310615},
+ {-0.284407537211271843618310615}, { 0.958703474895871555374645792},
+ { 0.476799230063322133342158117}, { 0.879012226428633477831323711},
+ {-0.879012226428633477831323711}, { 0.476799230063322133342158117},
+ { 0.776888465673232450040827983}, { 0.629638238914927025372981341},
+ {-0.629638238914927025372981341}, { 0.776888465673232450040827983},
+ { 0.104121633872054579120943880}, { 0.994564570734255452119106243},
+ {-0.994564570734255452119106243}, { 0.104121633872054579120943880},
+ { 0.981963869109555264072848154}, { 0.189068664149806212754997837},
+ {-0.189068664149806212754997837}, { 0.981963869109555264072848154},
+ { 0.560661576197336023839710223}, { 0.828045045257755752067527592},
+ {-0.828045045257755752067527592}, { 0.560661576197336023839710223},
+ { 0.834862874986380056304401383}, { 0.550457972936604802977289893},
+ {-0.550457972936604802977289893}, { 0.834862874986380056304401383},
+ { 0.201104634842091911558443546}, { 0.979569765685440534439326110},
+ {-0.979569765685440534439326110}, { 0.201104634842091911558443546},
+ { 0.926210242138311341974793388}, { 0.377007410216418256726567823},
+ {-0.377007410216418256726567823}, { 0.926210242138311341974793388},
+ { 0.388345046698826291624993541}, { 0.921514039342041943465396332},
+ {-0.921514039342041943465396332}, { 0.388345046698826291624993541},
+ { 0.711432195745216441522130290}, { 0.702754744457225302452914421},
+ {-0.702754744457225302452914421}, { 0.711432195745216441522130290},
+ { 0.006135884649154475359640235}, { 0.999981175282601142656990438},
+ {-0.999981175282601142656990438}, { 0.006135884649154475359640235},
+ { 0.999995293809576171511580126}, { 0.003067956762965976270145365},
+ {-0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ { 0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ {-0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ { 0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ {-0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ { 0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ {-0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ { 0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ {-0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ { 0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ {-0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ { 0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ {-0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ { 0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ {-0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ { 0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ {-0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ { 0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ {-0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ { 0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ {-0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ { 0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ {-0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ { 0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ {-0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ { 0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ {-0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ { 0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ {-0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ { 0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ {-0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ { 0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ {-0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ { 0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ {-0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ { 0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ {-0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ { 0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ {-0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ { 0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ {-0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ { 0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ {-0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ { 0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ {-0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ { 0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ {-0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ { 0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ {-0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ { 0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ {-0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ { 0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ {-0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ { 0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ {-0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ { 0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ {-0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ { 0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ {-0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ { 0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ {-0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ { 0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ {-0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ { 0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ {-0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ { 0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ {-0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ { 0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ {-0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ { 0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ {-0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ { 0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ {-0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ { 0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ {-0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ { 0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ {-0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ { 0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ {-0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ { 0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ {-0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ { 0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ {-0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ { 0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ {-0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ { 0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ {-0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ { 0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ {-0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ { 0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ {-0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ { 0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ {-0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ { 0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ {-0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ { 0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ {-0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ { 0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ {-0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ { 0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ {-0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ { 0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ {-0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ { 0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ {-0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ { 0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ {-0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ { 0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ {-0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ { 0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ {-0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ { 0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ {-0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ { 0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ {-0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ { 0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ {-0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ { 0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ {-0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ { 0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ {-0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ { 0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ {-0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ { 0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ {-0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ { 0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ {-0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ { 0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ {-0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ { 0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ {-0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ { 0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ {-0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ { 0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ {-0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ { 0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ {-0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ { 0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ {-0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ { 0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ {-0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ { 0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ {-0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ { 0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ {-0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ { 0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ {-0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ { 0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ {-0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ { 0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ {-0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ { 0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ {-0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ { 0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ {-0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ { 0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ {-0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ { 0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ {-0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ { 0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ {-0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ { 0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ {-0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ { 0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ {-0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ { 0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ {-0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ { 0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ {-0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ { 0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ {-0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ { 0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ {-0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ { 0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ {-0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ { 0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ {-0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ { 0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ {-0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ { 0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ {-0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ { 0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ {-0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ { 0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ {-0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ { 0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ {-0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ { 0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ {-0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ { 0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ {-0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ { 0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ {-0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ { 0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ {-0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ { 0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ {-0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ { 0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ {-0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ { 0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ {-0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ { 0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ {-0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ { 0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ {-0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ { 0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ {-0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ { 0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ {-0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ { 0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ {-0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ { 0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ {-0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ { 0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ {-0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ { 0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ {-0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ { 0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ {-0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ { 0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ {-0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ { 0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ {-0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ { 0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ {-0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ { 0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ {-0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ { 0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ {-0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ { 0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ {-0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ { 0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ {-0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ { 0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ {-0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ { 0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ {-0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ { 0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ {-0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ { 0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ {-0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ { 0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ {-0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ { 0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ {-0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ { 0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ {-0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ { 0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ {-0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ { 0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ {-0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ { 0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ {-0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ { 0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ {-0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ { 0.999957644551963866333120920}, { 0.009203754782059819315102378},
+ {-0.009203754782059819315102378}, { 0.999957644551963866333120920},
+ { 0.700568793943248366792866380}, { 0.713584868780793592903125099},
+ {-0.713584868780793592903125099}, { 0.700568793943248366792866380},
+ { 0.920318276709110566440076541}, { 0.391170384302253888687512949},
+ {-0.391170384302253888687512949}, { 0.920318276709110566440076541},
+ { 0.374164062971457997104393020}, { 0.927362525650401087274536959},
+ {-0.927362525650401087274536959}, { 0.374164062971457997104393020},
+ { 0.978948175319062194715480124}, { 0.204108966092816874181696950},
+ {-0.204108966092816874181696950}, { 0.978948175319062194715480124},
+ { 0.547894059173100165608820571}, { 0.836547727223511984524285790},
+ {-0.836547727223511984524285790}, { 0.547894059173100165608820571},
+ { 0.826321062845663480311195452}, { 0.563199344013834115007363772},
+ {-0.563199344013834115007363772}, { 0.826321062845663480311195452},
+ { 0.186055151663446648105438304}, { 0.982539302287441255907040396},
+ {-0.982539302287441255907040396}, { 0.186055151663446648105438304},
+ { 0.994240449453187946358413442}, { 0.107172424956808849175529148},
+ {-0.107172424956808849175529148}, { 0.994240449453187946358413442},
+ { 0.627251815495144113509622565}, { 0.778816512381475953374724325},
+ {-0.778816512381475953374724325}, { 0.627251815495144113509622565},
+ { 0.877545290207261291668470750}, { 0.479493757660153026679839798},
+ {-0.479493757660153026679839798}, { 0.877545290207261291668470750},
+ { 0.281464937925757984095231007}, { 0.959571513081984528335528181},
+ {-0.959571513081984528335528181}, { 0.281464937925757984095231007},
+ { 0.954228095109105629780430732}, { 0.299079826308040476750336973},
+ {-0.299079826308040476750336973}, { 0.954228095109105629780430732},
+ { 0.463259783551860197390719637}, { 0.886222530148880631647990821},
+ {-0.886222530148880631647990821}, { 0.463259783551860197390719637},
+ { 0.767138911935820381181694573}, { 0.641481012808583151988739898},
+ {-0.641481012808583151988739898}, { 0.767138911935820381181694573},
+ { 0.088853552582524596561586535}, { 0.996044700901251989887944810},
+ {-0.996044700901251989887944810}, { 0.088853552582524596561586535},
+ { 0.998301544933892840738782163}, { 0.058258264500435759613979782},
+ {-0.058258264500435759613979782}, { 0.998301544933892840738782163},
+ { 0.664710978203344868130324985}, { 0.747100605980180144323078847},
+ {-0.747100605980180144323078847}, { 0.664710978203344868130324985},
+ { 0.900015892016160228714535267}, { 0.435857079922255491032544080},
+ {-0.435857079922255491032544080}, { 0.900015892016160228714535267},
+ { 0.328209843579092526107916817}, { 0.944604837261480265659265493},
+ {-0.944604837261480265659265493}, { 0.328209843579092526107916817},
+ { 0.967753837093475465243391912}, { 0.251897818154216950498106628},
+ {-0.251897818154216950498106628}, { 0.967753837093475465243391912},
+ { 0.506186645345155291048942344}, { 0.862423956111040538690933878},
+ {-0.862423956111040538690933878}, { 0.506186645345155291048942344},
+ { 0.797690840943391108362662755}, { 0.603066598540348201693430617},
+ {-0.603066598540348201693430617}, { 0.797690840943391108362662755},
+ { 0.137620121586486044948441663}, { 0.990485084256457037998682243},
+ {-0.990485084256457037998682243}, { 0.137620121586486044948441663},
+ { 0.987784141644572154230969032}, { 0.155828397654265235743101486},
+ {-0.155828397654265235743101486}, { 0.987784141644572154230969032},
+ { 0.588281548222645304786439813}, { 0.808656181588174991946968128},
+ {-0.808656181588174991946968128}, { 0.588281548222645304786439813},
+ { 0.852960604930363657746588082}, { 0.521975292937154342694258318},
+ {-0.521975292937154342694258318}, { 0.852960604930363657746588082},
+ { 0.234041958583543423191242045}, { 0.972226497078936305708321144},
+ {-0.972226497078936305708321144}, { 0.234041958583543423191242045},
+ { 0.938403534063108112192420774}, { 0.345541324963989065539191723},
+ {-0.345541324963989065539191723}, { 0.938403534063108112192420774},
+ { 0.419216888363223956433010020}, { 0.907886116487666212038681480},
+ {-0.907886116487666212038681480}, { 0.419216888363223956433010020},
+ { 0.734738878095963464563223604}, { 0.678350043129861486873655042},
+ {-0.678350043129861486873655042}, { 0.734738878095963464563223604},
+ { 0.039872927587739811128578738}, { 0.999204758618363895492950001},
+ {-0.999204758618363895492950001}, { 0.039872927587739811128578738},
+ { 0.999430604555461772019008327}, { 0.033741171851377584833716112},
+ {-0.033741171851377584833716112}, { 0.999430604555461772019008327},
+ { 0.682845546385248068164596123}, { 0.730562769227827561177758850},
+ {-0.730562769227827561177758850}, { 0.682845546385248068164596123},
+ { 0.910441292258067196934095369}, { 0.413638312238434547471944324},
+ {-0.413638312238434547471944324}, { 0.910441292258067196934095369},
+ { 0.351292756085567125601307623}, { 0.936265667170278246576310996},
+ {-0.936265667170278246576310996}, { 0.351292756085567125601307623},
+ { 0.973644249650811925318383912}, { 0.228072083170885739254457379},
+ {-0.228072083170885739254457379}, { 0.973644249650811925318383912},
+ { 0.527199134781901348464274575}, { 0.849741768000852489471268395},
+ {-0.849741768000852489471268395}, { 0.527199134781901348464274575},
+ { 0.812250586585203913049744181}, { 0.583308652937698294392830961},
+ {-0.583308652937698294392830961}, { 0.812250586585203913049744181},
+ { 0.161886393780111837641387995}, { 0.986809401814185476970235952},
+ {-0.986809401814185476970235952}, { 0.161886393780111837641387995},
+ { 0.991310859846115418957349799}, { 0.131540028702883111103387493},
+ {-0.131540028702883111103387493}, { 0.991310859846115418957349799},
+ { 0.607949784967773667243642671}, { 0.793975477554337164895083757},
+ {-0.793975477554337164895083757}, { 0.607949784967773667243642671},
+ { 0.865513624090569082825488358}, { 0.500885382611240786241285004},
+ {-0.500885382611240786241285004}, { 0.865513624090569082825488358},
+ { 0.257831102162159005614471295}, { 0.966190003445412555433832961},
+ {-0.966190003445412555433832961}, { 0.257831102162159005614471295},
+ { 0.946600913083283570044599823}, { 0.322407678801069848384807478},
+ {-0.322407678801069848384807478}, { 0.946600913083283570044599823},
+ { 0.441371268731716692879988968}, { 0.897324580705418281231391836},
+ {-0.897324580705418281231391836}, { 0.441371268731716692879988968},
+ { 0.751165131909686411205819422}, { 0.660114342067420478559490747},
+ {-0.660114342067420478559490747}, { 0.751165131909686411205819422},
+ { 0.064382630929857460819324537}, { 0.997925286198596012623025462},
+ {-0.997925286198596012623025462}, { 0.064382630929857460819324537},
+ { 0.996571145790554847093566910}, { 0.082740264549375693111987083},
+ {-0.082740264549375693111987083}, { 0.996571145790554847093566910},
+ { 0.646176012983316364832802220}, { 0.763188417263381271704838297},
+ {-0.763188417263381271704838297}, { 0.646176012983316364832802220},
+ { 0.889048355854664562540777729}, { 0.457813303598877221904961155},
+ {-0.457813303598877221904961155}, { 0.889048355854664562540777729},
+ { 0.304929229735402406490728633}, { 0.952375012719765858529893608},
+ {-0.952375012719765858529893608}, { 0.304929229735402406490728633},
+ { 0.961280485811320641748659653}, { 0.275571819310958163076425168},
+ {-0.275571819310958163076425168}, { 0.961280485811320641748659653},
+ { 0.484869248000791101822951699}, { 0.874586652278176112634431897},
+ {-0.874586652278176112634431897}, { 0.484869248000791101822951699},
+ { 0.782650596166575738458949301}, { 0.622461279374149972519166721},
+ {-0.622461279374149972519166721}, { 0.782650596166575738458949301},
+ { 0.113270952177564349018228733}, { 0.993564135520595333782021697},
+ {-0.993564135520595333782021697}, { 0.113270952177564349018228733},
+ { 0.983662419211730274396237776}, { 0.180022901405699522679906590},
+ {-0.180022901405699522679906590}, { 0.983662419211730274396237776},
+ { 0.568258952670131549790548489}, { 0.822849781375826332046780034},
+ {-0.822849781375826332046780034}, { 0.568258952670131549790548489},
+ { 0.839893794195999504583383987}, { 0.542750784864515906586768661},
+ {-0.542750784864515906586768661}, { 0.839893794195999504583383987},
+ { 0.210111836880469621717489972}, { 0.977677357824509979943404762},
+ {-0.977677357824509979943404762}, { 0.210111836880469621717489972},
+ { 0.929640895843181265457918066}, { 0.368466829953372331712746222},
+ {-0.368466829953372331712746222}, { 0.929640895843181265457918066},
+ { 0.396809987416710328595290911}, { 0.917900775621390457642276297},
+ {-0.917900775621390457642276297}, { 0.396809987416710328595290911},
+ { 0.717870045055731736211325329}, { 0.696177131491462944788582591},
+ {-0.696177131491462944788582591}, { 0.717870045055731736211325329},
+ { 0.015339206284988101044151868}, { 0.999882347454212525633049627},
+ {-0.999882347454212525633049627}, { 0.015339206284988101044151868},
+ { 0.999769405351215321657617036}, { 0.021474080275469507418374898},
+ {-0.021474080275469507418374898}, { 0.999769405351215321657617036},
+ { 0.691759258364157774906734132}, { 0.722128193929215321243607198},
+ {-0.722128193929215321243607198}, { 0.691759258364157774906734132},
+ { 0.915448716088267819566431292}, { 0.402434650859418441082533934},
+ {-0.402434650859418441082533934}, { 0.915448716088267819566431292},
+ { 0.362755724367397216204854462}, { 0.931884265581668106718557199},
+ {-0.931884265581668106718557199}, { 0.362755724367397216204854462},
+ { 0.976369731330021149312732194}, { 0.216106797076219509948385131},
+ {-0.216106797076219509948385131}, { 0.976369731330021149312732194},
+ { 0.537587076295645482502214932}, { 0.843208239641845437161743865},
+ {-0.843208239641845437161743865}, { 0.537587076295645482502214932},
+ { 0.819347520076796960824689637}, { 0.573297166698042212820171239},
+ {-0.573297166698042212820171239}, { 0.819347520076796960824689637},
+ { 0.173983873387463827950700807}, { 0.984748501801904218556553176},
+ {-0.984748501801904218556553176}, { 0.173983873387463827950700807},
+ { 0.992850414459865090793563344}, { 0.119365214810991364593637790},
+ {-0.119365214810991364593637790}, { 0.992850414459865090793563344},
+ { 0.617647307937803932403979402}, { 0.786455213599085757522319464},
+ {-0.786455213599085757522319464}, { 0.617647307937803932403979402},
+ { 0.871595086655951034842481435}, { 0.490226483288291154229598449},
+ {-0.490226483288291154229598449}, { 0.871595086655951034842481435},
+ { 0.269668325572915106525464462}, { 0.962953266873683886347921481},
+ {-0.962953266873683886347921481}, { 0.269668325572915106525464462},
+ { 0.950486073949481721759926101}, { 0.310767152749611495835997250},
+ {-0.310767152749611495835997250}, { 0.950486073949481721759926101},
+ { 0.452349587233770874133026703}, { 0.891840709392342727796478697},
+ {-0.891840709392342727796478697}, { 0.452349587233770874133026703},
+ { 0.759209188978388033485525443}, { 0.650846684996380915068975573},
+ {-0.650846684996380915068975573}, { 0.759209188978388033485525443},
+ { 0.076623861392031492278332463}, { 0.997060070339482978987989949},
+ {-0.997060070339482978987989949}, { 0.076623861392031492278332463},
+ { 0.997511456140303459699448390}, { 0.070504573389613863027351471},
+ {-0.070504573389613863027351471}, { 0.997511456140303459699448390},
+ { 0.655492852999615385312679701}, { 0.755201376896536527598710756},
+ {-0.755201376896536527598710756}, { 0.655492852999615385312679701},
+ { 0.894599485631382678433072126}, { 0.446868840162374195353044389},
+ {-0.446868840162374195353044389}, { 0.894599485631382678433072126},
+ { 0.316593375556165867243047035}, { 0.948561349915730288158494826},
+ {-0.948561349915730288158494826}, { 0.316593375556165867243047035},
+ { 0.964589793289812723836432159}, { 0.263754678974831383611349322},
+ {-0.263754678974831383611349322}, { 0.964589793289812723836432159},
+ { 0.495565261825772531150266670}, { 0.868570705971340895340449876},
+ {-0.868570705971340895340449876}, { 0.495565261825772531150266670},
+ { 0.790230221437310055030217152}, { 0.612810082429409703935211936},
+ {-0.612810082429409703935211936}, { 0.790230221437310055030217152},
+ { 0.125454983411546238542336453}, { 0.992099313142191757112085445},
+ {-0.992099313142191757112085445}, { 0.125454983411546238542336453},
+ { 0.985797509167567424700995000}, { 0.167938294974731178054745536},
+ {-0.167938294974731178054745536}, { 0.985797509167567424700995000},
+ { 0.578313796411655563342245019}, { 0.815814410806733789010772660},
+ {-0.815814410806733789010772660}, { 0.578313796411655563342245019},
+ { 0.846490938774052078300544488}, { 0.532403127877197971442805218},
+ {-0.532403127877197971442805218}, { 0.846490938774052078300544488},
+ { 0.222093620973203534094094721}, { 0.975025345066994146844913468},
+ {-0.975025345066994146844913468}, { 0.222093620973203534094094721},
+ { 0.934092550404258914729877883}, { 0.357030961233430032614954036},
+ {-0.357030961233430032614954036}, { 0.934092550404258914729877883},
+ { 0.408044162864978680820747499}, { 0.912962190428398164628018233},
+ {-0.912962190428398164628018233}, { 0.408044162864978680820747499},
+ { 0.726359155084345976817494315}, { 0.687315340891759108199186948},
+ {-0.687315340891759108199186948}, { 0.726359155084345976817494315},
+ { 0.027608145778965741612354872}, { 0.999618822495178597116830637},
+ {-0.999618822495178597116830637}, { 0.027608145778965741612354872},
+ { 0.998941293186856850633930266}, { 0.046003182130914628814301788},
+ {-0.046003182130914628814301788}, { 0.998941293186856850633930266},
+ { 0.673829000378756060917568372}, { 0.738887324460615147933116508},
+ {-0.738887324460615147933116508}, { 0.673829000378756060917568372},
+ { 0.905296759318118774354048329}, { 0.424779681209108833357226189},
+ {-0.424779681209108833357226189}, { 0.905296759318118774354048329},
+ { 0.339776884406826857828825803}, { 0.940506070593268323787291309},
+ {-0.940506070593268323787291309}, { 0.339776884406826857828825803},
+ { 0.970772140728950302138169611}, { 0.240003022448741486568922365},
+ {-0.240003022448741486568922365}, { 0.970772140728950302138169611},
+ { 0.516731799017649881508753876}, { 0.856147328375194481019630732},
+ {-0.856147328375194481019630732}, { 0.516731799017649881508753876},
+ { 0.805031331142963597922659282}, { 0.593232295039799808047809426},
+ {-0.593232295039799808047809426}, { 0.805031331142963597922659282},
+ { 0.149764534677321517229695737}, { 0.988721691960323767604516485},
+ {-0.988721691960323767604516485}, { 0.149764534677321517229695737},
+ { 0.989622017463200834623694454}, { 0.143695033150294454819773349},
+ {-0.143695033150294454819773349}, { 0.989622017463200834623694454},
+ { 0.598160706996342311724958652}, { 0.801376171723140219430247777},
+ {-0.801376171723140219430247777}, { 0.598160706996342311724958652},
+ { 0.859301818357008404783582139}, { 0.511468850437970399504391001},
+ {-0.511468850437970399504391001}, { 0.859301818357008404783582139},
+ { 0.245955050335794611599924709}, { 0.969281235356548486048290738},
+ {-0.969281235356548486048290738}, { 0.245955050335794611599924709},
+ { 0.942573197601446879280758735}, { 0.333999651442009404650865481},
+ {-0.333999651442009404650865481}, { 0.942573197601446879280758735},
+ { 0.430326481340082633908199031}, { 0.902673318237258806751502391},
+ {-0.902673318237258806751502391}, { 0.430326481340082633908199031},
+ { 0.743007952135121693517362293}, { 0.669282588346636065720696366},
+ {-0.669282588346636065720696366}, { 0.743007952135121693517362293},
+ { 0.052131704680283321236358216}, { 0.998640218180265222418199049},
+ {-0.998640218180265222418199049}, { 0.052131704680283321236358216},
+ { 0.995480755491926941769171600}, { 0.094963495329638998938034312},
+ {-0.094963495329638998938034312}, { 0.995480755491926941769171600},
+ { 0.636761861236284230413943435}, { 0.771060524261813773200605759},
+ {-0.771060524261813773200605759}, { 0.636761861236284230413943435},
+ { 0.883363338665731594736308015}, { 0.468688822035827933697617870},
+ {-0.468688822035827933697617870}, { 0.883363338665731594736308015},
+ { 0.293219162694258650606608599}, { 0.956045251349996443270479823},
+ {-0.956045251349996443270479823}, { 0.293219162694258650606608599},
+ { 0.957826413027532890321037029}, { 0.287347459544729526477331841},
+ {-0.287347459544729526477331841}, { 0.957826413027532890321037029},
+ { 0.474100214650550014398580015}, { 0.880470889052160770806542929},
+ {-0.880470889052160770806542929}, { 0.474100214650550014398580015},
+ { 0.774953106594873878359129282}, { 0.632018735939809021909403706},
+ {-0.632018735939809021909403706}, { 0.774953106594873878359129282},
+ { 0.101069862754827824987887585}, { 0.994879330794805620591166107},
+ {-0.994879330794805620591166107}, { 0.101069862754827824987887585},
+ { 0.981379193313754574318224190}, { 0.192080397049892441679288205},
+ {-0.192080397049892441679288205}, { 0.981379193313754574318224190},
+ { 0.558118531220556115693702964}, { 0.829761233794523042469023765},
+ {-0.829761233794523042469023765}, { 0.558118531220556115693702964},
+ { 0.833170164701913186439915922}, { 0.553016705580027531764226988},
+ {-0.553016705580027531764226988}, { 0.833170164701913186439915922},
+ { 0.198098410717953586179324918}, { 0.980182135968117392690210009},
+ {-0.980182135968117392690210009}, { 0.198098410717953586179324918},
+ { 0.925049240782677590302371869}, { 0.379847208924051170576281147},
+ {-0.379847208924051170576281147}, { 0.925049240782677590302371869},
+ { 0.385516053843918864075607949}, { 0.922701128333878570437264227},
+ {-0.922701128333878570437264227}, { 0.385516053843918864075607949},
+ { 0.709272826438865651316533772}, { 0.704934080375904908852523758},
+ {-0.704934080375904908852523758}, { 0.709272826438865651316533772},
+ { 0.003067956762965976270145365}, { 0.999995293809576171511580126},
+ {-0.999995293809576171511580126}, { 0.003067956762965976270145365}
+};
+
+const fpr fpr_p2_tab[] = {
+ { 2.00000000000 },
+ { 1.00000000000 },
+ { 0.50000000000 },
+ { 0.25000000000 },
+ { 0.12500000000 },
+ { 0.06250000000 },
+ { 0.03125000000 },
+ { 0.01562500000 },
+ { 0.00781250000 },
+ { 0.00390625000 },
+ { 0.00195312500 }
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.h b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.h
new file mode 100644
index 000000000..a0aefe702
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/fpr.h
@@ -0,0 +1,362 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+
+#include <math.h>
+
+/*
+ * We wrap the native 'double' type into a structure so that the C compiler
+ * complains if we inadvertently use raw arithmetic operators on the 'fpr'
+ * type instead of using the inline functions below. This should have no
+ * extra runtime cost, since all the functions below are 'inline'.
+ */
+typedef struct {
+ double v;
+} fpr;
+
+static inline fpr
+FPR(double v) {
+ fpr x;
+
+ x.v = v;
+ return x;
+}
+
+static inline fpr
+fpr_of(int64_t i) {
+ return FPR((double)i);
+}
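As a small standalone illustration (outside this patch), raw arithmetic on 'fpr' values is rejected by the compiler, so every operation has to go through the inline helpers defined below; 'add_example' is a hypothetical name and the sketch assumes inner.h (which pulls in this header) is on the include path.

    #include "inner.h"   /* provides fpr, FPR() and fpr_add() */

    static fpr add_example(void) {
        fpr x = FPR(1.5), y = FPR(2.25);
        /* fpr z = x + y;   -- does not compile: invalid operands to binary '+' */
        return fpr_add(x, y);   /* result has .v == 3.75 */
    }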
+
+static const fpr fpr_q = { 12289.0 };
+static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
+static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 };
+static const fpr fpr_inv_sigma[] = {
+ { 0.0 }, /* unused */
+ { 0.0069054793295940891952143765991630516 },
+ { 0.0068102267767177975961393730687908629 },
+ { 0.0067188101910722710707826117910434131 },
+ { 0.0065883354370073665545865037227681924 },
+ { 0.0064651781207602900738053897763485516 },
+ { 0.0063486788828078995327741182928037856 },
+ { 0.0062382586529084374473367528433697537 },
+ { 0.0061334065020930261548984001431770281 },
+ { 0.0060336696681577241031668062510953022 },
+ { 0.0059386453095331159950250124336477482 }
+};
+static const fpr fpr_sigma_min[] = {
+ { 0.0 }, /* unused */
+ { 1.1165085072329102588881898380334015 },
+ { 1.1321247692325272405718031785357108 },
+ { 1.1475285353733668684571123112513188 },
+ { 1.1702540788534828939713084716509250 },
+ { 1.1925466358390344011122170489094133 },
+ { 1.2144300507766139921088487776957699 },
+ { 1.2359260567719808790104525941706723 },
+ { 1.2570545284063214162779743112075080 },
+ { 1.2778336969128335860256340575729042 },
+ { 1.2982803343442918539708792538826807 }
+};
+static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
+static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
+static const fpr fpr_bnorm_max = { 16822.4121 };
+static const fpr fpr_zero = { 0.0 };
+static const fpr fpr_one = { 1.0 };
+static const fpr fpr_two = { 2.0 };
+static const fpr fpr_onehalf = { 0.5 };
+static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
+static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
+static const fpr fpr_ptwo31 = { 2147483648.0 };
+static const fpr fpr_ptwo31m1 = { 2147483647.0 };
+static const fpr fpr_mtwo31m1 = { -2147483647.0 };
+static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
+static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
+static const fpr fpr_ptwo63 = { 9223372036854775808.0 };
+
+static inline int64_t
+fpr_rint(fpr x) {
+ /*
+ * We do not want to use llrint() since it might not be
+ * constant-time.
+ *
+ * Suppose that x >= 0. If x >= 2^52, then it is already an
+ * integer. Otherwise, if x < 2^52, then computing x+2^52 will
+ * yield a value that will be rounded to the nearest integer
+ * with exactly the right rules (round-to-nearest-even).
+ *
+ * In order to have constant-time processing, we must do the
+ * computation for both x >= 0 and x < 0 cases, and use a
+ * cast to an integer to access the sign and select the proper
+ * value. Such casts also allow us to find out if |x| < 2^52.
+ */
+ int64_t sx, tx, rp, rn, m;
+ uint32_t ub;
+
+ sx = (int64_t)(x.v - 1.0);
+ tx = (int64_t)x.v;
+ rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
+ rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;
+
+ /*
+ * If tx >= 2^52 or tx < -2^52, then result is tx.
+ * Otherwise, if sx >= 0, then result is rp.
+ * Otherwise, result is rn. We use the fact that when x is
+ * close to 0 (|x| <= 0.25) then both rp and rn are correct;
+ * and if x is not close to 0, then trunc(x-1.0) yields the
+ * appropriate sign.
+ */
+
+ /*
+ * Clamp rp to zero if tx < 0.
+ * Clamp rn to zero if tx >= 0.
+ */
+ m = sx >> 63;
+ rn &= m;
+ rp &= ~m;
+
+ /*
+ * Get the 12 upper bits of tx; if they are not all zeros or
+ * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
+ * rp and rn to zero. Otherwise, we clamp tx to zero.
+ */
+ ub = (uint32_t)((uint64_t)tx >> 52);
+ m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
+ rp &= m;
+ rn &= m;
+ tx &= ~m;
+
+ /*
+ * Only one of tx, rn or rp (at most) can be non-zero at this
+ * point.
+ */
+ return tx | rn | rp;
+}
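The 2^52 trick described in the comment above can be checked in isolation; this sketch (not part of the patch) assumes IEEE-754 binary64 doubles and the default round-to-nearest-even mode.

    #include <stdint.h>
    #include <stdio.h>

    /* For 0 <= x < 2^52, adding then subtracting 2^52 makes the FPU round
     * the fractional part away, with round-to-nearest-even on ties. */
    static int64_t rint_nonneg(double x) {
        return (int64_t)(x + 4503599627370496.0) - 4503599627370496;
    }

    int main(void) {
        printf("%lld\n", (long long)rint_nonneg(2.5)); /* 2 (tie rounds to even) */
        printf("%lld\n", (long long)rint_nonneg(3.5)); /* 4 (tie rounds to even) */
        printf("%lld\n", (long long)rint_nonneg(7.3)); /* 7 */
        return 0;
    }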
+
+static inline int64_t
+fpr_floor(fpr x) {
+ int64_t r;
+
+ /*
+ * The cast performs a trunc() (rounding toward 0) and thus is
+ * wrong by 1 for most negative values. The correction below is
+ * constant-time as long as the compiler turns the
+ * floating-point conversion result into a 0/1 integer without a
+ * conditional branch or another non-constant-time construction.
+ * This should hold on all modern architectures with an FPU (and
+ * if it is false on a given arch, then chances are that the FPU
+ * itself is not constant-time, making the point moot).
+ */
+ r = (int64_t)x.v;
+ return r - (x.v < (double)r);
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ return (int64_t)x.v;
+}
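A quick standalone check (not part of the patch) of the distinction between the two functions above: a raw cast truncates toward zero, so negative non-integers need the branchless correction used in fpr_floor().

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        double x = -1.25;
        int64_t t = (int64_t)x;             /* trunc: -1 (toward zero)      */
        int64_t f = t - (x < (double)t);    /* floor: -2 (correction fires) */
        printf("trunc=%lld floor=%lld\n", (long long)t, (long long)f);
        return 0;
    }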
+
+static inline fpr
+fpr_add(fpr x, fpr y) {
+ return FPR(x.v + y.v);
+}
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ return FPR(x.v - y.v);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ return FPR(-x.v);
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ return FPR(x.v * 0.5);
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ return FPR(x.v + x.v);
+}
+
+static inline fpr
+fpr_mul(fpr x, fpr y) {
+ return FPR(x.v * y.v);
+}
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return FPR(x.v * x.v);
+}
+
+static inline fpr
+fpr_inv(fpr x) {
+ return FPR(1.0 / x.v);
+}
+
+static inline fpr
+fpr_div(fpr x, fpr y) {
+ return FPR(x.v / y.v);
+}
+
+static inline void
+fpr_sqrt_avx2(double *t) {
+ __m128d x;
+
+ x = _mm_load1_pd(t);
+ x = _mm_sqrt_pd(x);
+ _mm_storel_pd(t, x);
+}
+
+static inline fpr
+fpr_sqrt(fpr x) {
+ /*
+ * We prefer not to have a dependency on libm when it can be
+ * avoided. On x86, calling the sqrt() libm function inlines
+ * the relevant opcode (fsqrt or sqrtsd, depending on whether
+ * the 387 FPU or SSE2 is used for floating-point operations)
+ * but then makes an optional call to the library function
+ * for proper error handling, in case the operand is negative.
+ *
+ * To avoid this dependency, we use intrinsics or inline assembly
+ * on recognized platforms:
+ *
+ * - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
+ *
+ * - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
+ * to call the 387 FPU fsqrt opcode.
+ *
+ * - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
+ * the fsqrt opcode (Clang needs a special hack).
+ *
+ * - On GCC/Clang on ARM with hardware floating-point, we use
+ * inline assembly to call the vsqrt.f64 opcode. Due to a
+ * complex ecosystem of compilers and assembly syntaxes, we
+ * have to call it "fsqrt" or "fsqrtd", depending on case.
+ *
+ * If the platform is not recognized, a call to the system
+ * library function sqrt() is performed. On some compilers, this
+ * may actually inline the relevant opcode, and call the library
+ * function only when the input is invalid (e.g. negative);
+ * Falcon never actually calls sqrt() on a negative value, but
+ * the dependency to libm will still be there.
+ */
+
+ fpr_sqrt_avx2(&x.v);
+ return x;
+}
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ return x.v < y.v;
+}
+
+static inline uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+
+ /*
+ * AVX2 implementation uses more operations than Horner's method,
+ * but with a lower expression tree depth. This helps because
+ * additions and multiplications have a latency of 4 cycles on
+ * a Skylake, but the CPU can issue two of them per cycle.
+ */
+
+ static const union {
+ double d[12];
+ __m256d v[3];
+ } c = {
+ {
+ 0.999999999999994892974086724280,
+ 0.500000000000019206858326015208,
+ 0.166666666666984014666397229121,
+ 0.041666666666110491190622155955,
+ 0.008333333327800835146903501993,
+ 0.001388888894063186997887560103,
+ 0.000198412739277311890541063977,
+ 0.000024801566833585381209939524,
+ 0.000002755586350219122514855659,
+ 0.000000275607356160477811864927,
+ 0.000000025299506379442070029551,
+ 0.000000002073772366009083061987
+ }
+ };
+
+ double d1, d2, d4, d8, y;
+ __m256d d14, d58, d9c;
+
+ d1 = -x.v;
+ d2 = d1 * d1;
+ d4 = d2 * d2;
+ d8 = d4 * d4;
+ d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
+ d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
+ d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
+ d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
+ d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
+ d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
+ d9c = _mm256_hadd_pd(d9c, d9c);
+ y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
+ + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
+ y *= ccs.v;
+
+ /*
+ * Final conversion goes through int64_t first, because that's what
+ * the underlying opcode (vcvttsd2si) will do, and we know that the
+ * result will fit, since x >= 0 and ccs < 1. If we did the
+ * conversion directly to uint64_t, then the compiler would add some
+ * extra code to cover the case of a source value of 2^63 or more,
+ * and though the alternate path would never be exercised, the
+ * extra comparison would cost us some cycles.
+ */
+ return (uint64_t)(int64_t)(y * fpr_ptwo63.v);
+
+}
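For reference, the same degree-12 FACCT polynomial can be evaluated with a plain scalar Horner scheme; the AVX2 code above computes the same value, only re-associated into 4-wide groups to shorten the dependency chain. This is a sketch outside the patch and 'expm_p63_scalar' is a hypothetical helper name.

    #include <stdint.h>

    static uint64_t expm_p63_scalar(double x, double ccs) {
        /* Coefficients of (-x)^12 down to (-x)^1, same table as the AVX2 code. */
        static const double C[12] = {
            0.000000002073772366009083061987,
            0.000000025299506379442070029551,
            0.000000275607356160477811864927,
            0.000002755586350219122514855659,
            0.000024801566833585381209939524,
            0.000198412739277311890541063977,
            0.001388888894063186997887560103,
            0.008333333327800835146903501993,
            0.041666666666110491190622155955,
            0.166666666666984014666397229121,
            0.500000000000019206858326015208,
            0.999999999999994892974086724280
        };
        double d1 = -x, y = C[0];
        int i;

        for (i = 1; i < 12; i ++) {
            y = y * d1 + C[i];
        }
        y = y * d1 + 1.0;   /* constant term of exp(-x) */
        return (uint64_t)(int64_t)(y * ccs * 9223372036854775808.0);
    }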
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED512_AVX2_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED512_AVX2_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/inner.h b/src/sig/falcon/pqclean_falcon-padded-512_avx2/inner.h
new file mode 100644
index 000000000..778174f93
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/inner.h
@@ -0,0 +1,827 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED512_AVX2_ macro (e.g. PQCLEAN_FALCONPADDED512_AVX2_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+/*
+ * This implementation uses AVX2 and optionally FMA intrinsics.
+ */
+#include <immintrin.h>
+#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)
+#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
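A usage sketch of these wrappers (outside the patch; the function and buffer names are illustrative): absorb data with as many inject() calls as needed, flip once, then extract.

    #include <stddef.h>
    #include <stdint.h>
    #include "inner.h"

    /* Absorb a 40-byte nonce followed by a message, then squeeze 64 bytes. */
    static void hash_nonce_message(const uint8_t nonce[40],
                                   const uint8_t *msg, size_t msg_len,
                                   uint8_t out[64]) {
        inner_shake256_context sc;

        inner_shake256_init(&sc);
        inner_shake256_inject(&sc, nonce, 40);
        inner_shake256_inject(&sc, msg, msg_len);
        inner_shake256_flip(&sc);
        inner_shake256_extract(&sc, out, 64);
        inner_shake256_ctx_release(&sc);
    }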
+
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED512_AVX2_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_AVX2_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED512_AVX2_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_AVX2_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
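A round-trip sketch for the "trim" format (outside the patch; the helper name and buffer sizes are illustrative): encode, then decode and compare against the original coefficients.

    #include <stdint.h>
    #include <string.h>
    #include "inner.h"

    static int trim_i8_roundtrip(const int8_t *f, unsigned logn, unsigned bits) {
        uint8_t buf[2048];
        int8_t back[1024];
        size_t n = (size_t)1 << logn, enc_len, dec_len;

        enc_len = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
                      buf, sizeof buf, f, logn, bits);
        if (enc_len == 0) {
            return 0;   /* buffer too small, or a coefficient out of range */
        }
        dec_len = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
                      back, logn, bits, buf, enc_len);
        return dec_len == enc_len && memcmp(back, f, n) == 0;
    }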
+
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_AVX2_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
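A sketch of how these primitives fit together on the verification side (outside the patch; fixed to logn = 9, i.e. Falcon(-padded)-512, and the function name is illustrative).

    #include <stdint.h>
    #include "inner.h"

    /* h[] holds the decoded public key, s2[] the decoded signature, and
     * sc is a flipped SHAKE256 context over nonce || message. */
    static int verify_sketch(uint16_t h[512], const int16_t s2[512],
                             inner_shake256_context *sc) {
        uint16_t hm[512];
        union { uint8_t b[2 * 512]; uint16_t align; } tmp;  /* 16-bit aligned */

        PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_vartime(sc, hm, 9);
        PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(h, 9);
        return PQCLEAN_FALCONPADDED512_AVX2_verify_raw(hm, s2, h, 9, tmp.b);
    }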
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. Private key consists in
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x, fpr ccs)
+ * return exp(-x)*ccs, assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x)*ccs,
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED512_AVX2_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. The PRNG draws from the provided SHAKE256
+ * context (in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are less than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
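A seeding sketch for this PRNG (outside the patch; the seed handling and helper name are illustrative): the SHAKE256 context must be injected and flipped before prng_init() reads from it.

    #include <stddef.h>
    #include <stdint.h>
    #include "inner.h"

    static uint64_t draw_u64_from_seed(const uint8_t *seed, size_t seed_len) {
        inner_shake256_context sc;
        prng p;

        inner_shake256_init(&sc);
        inner_shake256_inject(&sc, seed, seed_len);
        inner_shake256_flip(&sc);
        PQCLEAN_FALCONPADDED512_AVX2_prng_init(&p, &sc);
        inner_shake256_ctx_release(&sc);
        return prng_get_u64(&p);
    }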
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+ }
+ return v;
+}
+
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
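A sketch of multiplying two polynomials modulo X^N+1 through the FFT domain (outside the patch; logn = 9 and the function name are illustrative).

    #include "inner.h"

    /* a and b hold N = 512 real coefficients each; on return, a holds
     * the product a*b reduced modulo X^512+1. */
    static void polymul_sketch(fpr a[512], fpr b[512]) {
        PQCLEAN_FALCONPADDED512_AVX2_FFT(a, 9);
        PQCLEAN_FALCONPADDED512_AVX2_FFT(b, 9);
        PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(a, b, 9);  /* FFT-domain product */
        PQCLEAN_FALCONPADDED512_AVX2_iFFT(a, 9);             /* back to coefficients */
    }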
+
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
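+
+/*
+ * As a quick check of the formula above: for logn = 9 (N = 512, the
+ * degree used by this parameter set), 28*2^9 = 14336 =
+ * FALCON_KEYGEN_TEMP_9, and for logn = 10, 28*2^10 = 28672 =
+ * FALCON_KEYGEN_TEMP_10. Only the logn = 1 and 2 entries (136 and
+ * 272) exceed 28*2^logn.
+ */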
+
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
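+
+/*
+ * Minimal usage sketch (an illustration, assuming a SHAKE256 context
+ * 'sc' that has already been seeded and flipped; logn = 9, N = 512):
+ *
+ *   int8_t f[512], g[512], F[512], G[512];
+ *   uint16_t h[512];
+ *   union { uint8_t b[FALCON_KEYGEN_TEMP_9]; uint64_t align; } tmp;
+ *
+ *   PQCLEAN_FALCONPADDED512_AVX2_keygen(&sc, f, g, F, G, h, 9, tmp.b);
+ *
+ * The union provides the required 64-bit alignment for tmp[]; G and h
+ * could also be passed as NULL, as described above.
+ */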
+
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED512_AVX2_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but it
+ * increases the signing cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
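+
+/*
+ * Memory trade-off between the two signing paths, worked out for
+ * logn = 9: the expanded key takes (8*9+40)*512 = 57344 bytes and
+ * sign_tree() needs a 48*512 = 24576-byte tmp[], while sign_dyn()
+ * needs no expanded key but a larger 72*512 = 36864-byte tmp[].
+ */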
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED512_AVX2_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(prng *p);
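+
+/*
+ * Typical use (a sketch of how the signing code drives these samplers,
+ * not an additional API guarantee): a sampler_context is filled by
+ * seeding its PRNG 'p' from the SHAKE256-based random source and
+ * setting 'sigma_min' to the per-degree minimum standard deviation;
+ * PQCLEAN_FALCONPADDED512_AVX2_sampler is then invoked with a pointer
+ * to that context for each integer to sample, and internally relies on
+ * gaussian0_sampler() as its half-Gaussian base sampler.
+ */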
+
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/keygen.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/keygen.c
new file mode 100644
index 000000000..8644e9163
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/keygen.c
@@ -0,0 +1,4233 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
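+
+/*
+ * For instance, the first entry below satisfies these constraints:
+ * 2147473409 - 1 = 2048 * 1048571, so p = 1 mod 2048, and
+ * 2147473409 < 2^31 = 2147483648.
+ */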
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
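+
+/*
+ * Why the iteration above converges: for odd p, the initial value
+ * y = 2 - p satisfies p*y = 1 mod 4, and each step y <- y*(2 - p*y)
+ * squares the error term, doubling the number of correct low bits
+ * (if p*y = 1 mod 2^k, then the new product is 1 mod 2^(2k)). Four
+ * iterations thus give 1/p mod 2^32, and the final negation and mask
+ * return -1/p mod 2^31.
+ */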
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
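+
+/*
+ * Note on the reduction above: since p0i = -1/p mod 2^31, the value
+ * w = ((z*p0i) mod 2^31) * p satisfies z + w = 0 mod 2^31, so the
+ * right shift by 31 drops only zero bits. Moreover, z < p^2 and
+ * w < 2^31*p, hence z + w < 2^63 (no 64-bit overflow) and
+ * (z + w)/2^31 < 2p, so subtracting p and conditionally adding it
+ * back yields a*b/2^31 mod p in the 0..p-1 range.
+ */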
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
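+
+/*
+ * The exponentiation above is Fermat inversion: for prime p and
+ * b != 0, b^(p-2) = 1/b mod p. The loop processes the 31 bits of the
+ * exponent p-2 from the top down, using a constant-time select
+ * (z ^= (z ^ z2) & mask) instead of a branch; when b = 0, every
+ * selected product is 0 and the function returns 0 as documented.
+ */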
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it (10 - logn) times,
+ * halving its order each time, to obtain an element of order 2N.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
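+
+/*
+ * In the loop above, the adjacent NTT slots f[2u] and f[2u+1]
+ * correspond to a pair of opposite evaluation points w and -w, and
+ * f(w)*f(-w) is the evaluation of f0^2 - X*f1^2 at w^2. The extra
+ * multiplication by R2 cancels the 1/R factor introduced by the inner
+ * Montgomery multiplication, so f[u] holds the plain product modulo p.
+ */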
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
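+
+/*
+ * Encoding example: with 31-bit words, the value 2^40 occupies two
+ * words, x[0] = 0 and x[1] = 512 (since 2^40 = 512 * 2^31), while -1
+ * over 'len' words is 0x7FFFFFFF repeated 'len' times (two's
+ * complement over 31*len bits).
+ */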
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m with a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contain the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
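+
+/*
+ * Toy example of the incremental CRT step implemented above (small
+ * primes, and ignoring the Montgomery factor carried by the
+ * precomputed 's' values): take p1 = 7, p2 = 11 and the integer 40,
+ * whose RNS representation is (40 mod 7, 40 mod 11) = (5, 7). After
+ * the first prime, the partial value is 5 and q = 7. For p = 11:
+ * s = 1/7 mod 11 = 8, xp = 7, xq = 5 mod 11 = 5, hence
+ * xr = 8*(7-5) mod 11 = 5, and the rebuilt value is 5 + 7*5 = 40.
+ */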
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have their 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remainder of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step reduces the total length
+ * (sum of the lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
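+ /*
+ * The next line clears c0 (and keeps it cleared) once a
+ * nonzero word of a or b has been seen: for a 31-bit word
+ * w, ((w + 0x7FFFFFFF) >> 31) is 1 when w != 0 and 0 when
+ * w = 0.
+ */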
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiple of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, thus a division by 2 really is a
+ * non-multiplication by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
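+ *
+ * If cA = 1 (a is halved), a_hi is shifted right while
+ * b_lo, qa and qb are doubled; if cA = 0 (b is halved),
+ * b_hi is shifted right while a_lo, pa and pb are doubled.
+ * Doubling the other side, instead of shifting this one,
+ * is what keeps bit i of a_lo/b_lo usable as the current
+ * low bit at the next iteration.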
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
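+ *
+ * For example, sc = 100 yields sch = 3 and scl = 7: each word of
+ * y is shifted left by 7 bits (with carry into the next word) and
+ * accumulation into x starts at word index 3.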
+ */
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each
+ * coefficient, but to a higher-order word.
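+ * (Callers do exactly that: solve_NTRU_intermediate() passes
+ * ft + slen - rlen with flen = rlen = 10 whenever slen exceeds
+ * 10 words.)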
+ */
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
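+ /*
+ * Multiplication is modulo X^N+1: once the running index
+ * u+v goes past N-1, the target pointer wraps back to the
+ * start of F and the sign of the multiplier is flipped
+ * (since X^N = -1).
+ */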
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ /*
+ * On little-endian systems we just interpret the bytes "as is"
+ * (this is correct because the exact-width types such as
+ * 'uint64_t' are guaranteed to have no padding and no trap
+ * representation).
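+ * This AVX2 variant targets x86 processors, which are
+ * little-endian, so keeping only this interpretation is safe
+ * here.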
+ */
+ uint64_t r;
+
+ inner_shake256_extract(rng, (uint8_t *)&r, sizeof r);
+ return r;
+}
+
+/*
+ * The table below encodes a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
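+ *
+ * For example, with logn = 9 (Falcon-512), two values are summed
+ * (g = 2 below), which yields standard deviation sigma*sqrt(2) =
+ * 1.17*sqrt(q/(2*512)), as required for N = 512.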
+ */
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
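+
+/*
+ * As a sanity check against the measured averages above:
+ * MAX_BL_SMALL[10] = 209 words is 209*31 = 6479 bits (average fg
+ * length at depth 10 is about 6308 bits), and MAX_BL_LARGE[9] = 308
+ * words is 308*31 = 9548 bits (average FG length at depth 9 is about
+ * 9403 bits).
+ */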
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
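+ /*
+ * ng has its top bit set if the running sum ever reached 2^31;
+ * in that case the OR below saturates the result to 2^32-1.
+ */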
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
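+ /*
+ * In NTT representation, the two values for a pair of roots
+ * (w, -w) are consecutive; their product is N(f)(w^2) =
+ * f(w)*f(-w), i.e. one NTT coefficient of f' at half degree.
+ * The extra Montgomery multiplication by R2 cancels the 1/R
+ * factor of the first one, so the stored value is the plain
+ * product modulo p.
+ */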
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because that would
+ * imply failure to solve the NTRU equation, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in fk->tmp (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficient will remain low. If it goes out of range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+ * should fit in -2^31-1..+2^31-1.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
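+ * The loop below computes pdc = pt^dc by plain
+ * square-and-multiply, with pt set to 2 or 1/2 depending on
+ * the sign of the original dc.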
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+ * "late" instead of "early". We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the first n elements
+ * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+ * Set ft and gt to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depths except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * The integer representation of F and G is no longer needed,
+ * so we can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+ * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
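+ /*
+ * Note: modp_montymul(a, b, p, p0i) returns a*b/R mod p (Montgomery
+ * multiplication), so r below is q/R and each product in the loop
+ * carries the same 1/R factor; comparing z against r thus checks
+ * f*G - g*F = q mod p without further conversions.
+ */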
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+ * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
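+ /*
+ * The OR with -((normf | normg) >> 31) below forces norm to
+ * 2^32-1 whenever either squared norm has its top bit set (the
+ * saturation marker used by poly_small_sqnorm()), so an
+ * overflowing (f,g) simply fails the bound test and is resampled.
+ */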
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED512_AVX2_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/pqclean.c
new file mode 100644
index 000000000..171105004
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 666 bytes
+ *
+ * message + signature:
+ * signature 666 bytes
+ * message
+ */
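+
+/*
+ * Illustrative arithmetic for the layouts above, with nnnn = 9
+ * (Falcon-padded-512, 6 bits per f/g element):
+ * private key: 1 + 512*6/8 + 512*6/8 + 512 = 1281 bytes
+ * public key: 1 + 512*14/8 = 897 bytes
+ * signature: 1 + 40 + 625 (padded value) = 666 bytes
+ */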
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_9];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512];
+ uint16_t h[512];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED512_AVX2_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 9;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ f, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ g, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u,
+ F, 9, PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 9;
+ v = PQCLEAN_FALCONPADDED512_AVX2_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 9);
+ if (v != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
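+/*
+ * Note for the padded variant: both callers in this file pass
+ * sigbuflen = CRYPTO_BYTES - NONCELEN - 1, and once an encoded value
+ * fits, the unused tail of sigbuf[] is zero-filled (memset() below);
+ * do_verify() relies on this when it rejects any non-zero padding.
+ */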
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512], G[512];
+ struct {
+ int16_t sig[512];
+ uint16_t hm[512];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 9) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ f, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ g, 9, PQCLEAN_FALCONPADDED512_AVX2_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_AVX2_trim_i8_decode(
+ F, 9, PQCLEAN_FALCONPADDED512_AVX2_max_FG_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED512_AVX2_complete_private(G, f, g, F, 9, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(&sc, r.hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCONPADDED512_AVX2_comp_encode(sigbuf, sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[512], hm[512];
+ int16_t sig[512];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 9) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED512_AVX2_modq_decode(h, 9,
+ pk + 1, PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(h, 9);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED512_AVX2_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_AVX2_hash_to_point_ct(&sc, hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED512_AVX2_verify_raw(hm, sig, h, 9, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 9;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 9) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 9;
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
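+
+/*
+ * Layout of the signed message produced above and consumed by
+ * crypto_sign_open() below (illustrative summary of this file's code):
+ * sm[0] header byte 0x30 + 9
+ * sm[1..40] nonce (NONCELEN bytes)
+ * sm[41..665] padded signature value (CRYPTO_BYTES - NONCELEN - 1 bytes)
+ * sm[666..] message (mlen bytes), hence smlen = CRYPTO_BYTES + mlen
+ */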
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 9) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED512_AVX2_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c
new file mode 100644
index 000000000..203d31f9d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/rng.c
@@ -0,0 +1,179 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <assert.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_prng_init(prng *p, inner_shake256_context *src) {
+ inner_shake256_extract(src, p->state.d, 56);
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists of the key (32 bytes), then the IV (16 bytes) and the block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
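+
+/*
+ * Concretely (as can be read off the store loop at the end of
+ * prng_refill() below): one refill produces 16*32 = 512 bytes, and the
+ * 32-bit word u (0 <= u < 16) of parallel instance j (0 <= j < 8) is
+ * stored little-endian at byte offset 32*u + 4*j of the buffer.
+ */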
+void
+PQCLEAN_FALCONPADDED512_AVX2_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+ int i;
+ uint32_t *sw;
+ union {
+ uint32_t w[16];
+ __m256i y[2]; /* for alignment */
+ } t;
+ __m256i state[16], init[16];
+
+ sw = (uint32_t *)p->state.d;
+
+ /*
+ * XOR next counter values into state.
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ t.w[u] = (uint32_t)(cc + u);
+ t.w[u + 8] = (uint32_t)((cc + u) >> 32);
+ }
+ *(uint64_t *)(p->state.d + 48) = cc + 8;
+
+ /*
+ * Load state.
+ */
+ for (u = 0; u < 4; u ++) {
+ state[u] = init[u] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)CW[u]));
+ }
+ for (u = 0; u < 10; u ++) {
+ state[u + 4] = init[u + 4] =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[u]));
+ }
+ state[14] = init[14] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[10])),
+ _mm256_loadu_si256((__m256i *)&t.w[0]));
+ state[15] = init[15] = _mm256_xor_si256(
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128((int)sw[11])),
+ _mm256_loadu_si256((__m256i *)&t.w[8]));
+
+ /*
+ * Do all rounds.
+ */
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 16), \
+ _mm256_srli_epi32(state[d], 16)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 12), \
+ _mm256_srli_epi32(state[b], 20)); \
+ state[a] = _mm256_add_epi32(state[a], state[b]); \
+ state[d] = _mm256_xor_si256(state[d], state[a]); \
+ state[d] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[d], 8), \
+ _mm256_srli_epi32(state[d], 24)); \
+ state[c] = _mm256_add_epi32(state[c], state[d]); \
+ state[b] = _mm256_xor_si256(state[b], state[c]); \
+ state[b] = _mm256_or_si256( \
+ _mm256_slli_epi32(state[b], 7), \
+ _mm256_srli_epi32(state[b], 25)); \
+ } while (0)
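+
+ /*
+ * QROUND above is the standard ChaCha20 quarter-round applied to
+ * eight states at once; the slli/srli/or pairs implement the scalar
+ * rotations:
+ * a += b; d ^= a; d <<<= 16;
+ * c += d; b ^= c; b <<<= 12;
+ * a += b; d ^= a; d <<<= 8;
+ * c += d; b ^= c; b <<<= 7;
+ */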
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ /*
+ * Add initial state back and encode the result in the destination
+ * buffer. We can dump the AVX2 values "as is" because the non-AVX2
+ * code uses a compatible order of values.
+ */
+ for (u = 0; u < 16; u ++) {
+ _mm256_storeu_si256((__m256i *)&p->buf.d[u << 5],
+ _mm256_add_epi32(state[u], init[u]));
+ }
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+ memcpy(buf, p->buf.d, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_AVX2_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/sign.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/sign.c
new file mode 100644
index 000000000..0e8eb7173
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/sign.c
@@ -0,0 +1,1319 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+ * has two child trees of size logn-1 each. Thus, the tree size s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
+ */
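+ /*
+ * (Quick check, for illustration:) the closed form below,
+ * s(logn) = (logn+1)*2^logn, satisfies both relations:
+ * s(0) = 1, and
+ * 2^logn + 2*logn*2^(logn-1) = (logn+1)*2^logn.
+ * E.g. s(2) = 4 + 2*s(1) = 4 + 2*4 = 12 = 3*2^2.
+ */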
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr w0, w1, w2, w3, sigma;
+ __m128d ww0, ww1, wa, wb, wc, wd;
+ __m128d wy0, wy1, wz0, wz1;
+ __m128d half, invsqrt8, invsqrt2, neghi, neglo;
+ int si0, si1, si2, si3;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ half = _mm_set1_pd(0.5);
+ invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052);
+ invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105);
+ neghi = _mm_set_pd(-0.0, 0.0);
+ neglo = _mm_set_pd(0.0, -0.0);
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ ww0 = _mm_loadu_pd(&t1[0].v);
+ ww1 = _mm_loadu_pd(&t1[2].v);
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree1[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree1[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree1[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z1[0].v, ww0);
+ _mm_storeu_pd(&z1[2].v, ww1);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0);
+ wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1);
+ wz0 = _mm_loadu_pd(&tree[0].v);
+ wz1 = _mm_loadu_pd(&tree[2].v);
+ ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1));
+ ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0));
+ ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v));
+ ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v));
+
+ /*
+ * Second recursive invocation.
+ */
+ wa = _mm_unpacklo_pd(ww0, ww1);
+ wb = _mm_unpackhi_pd(ww0, ww1);
+ wc = _mm_add_pd(wa, wb);
+ ww0 = _mm_mul_pd(wc, half);
+ wc = _mm_sub_pd(wa, wb);
+ wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
+ ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
+
+ w2.v = _mm_cvtsd_f64(ww1);
+ w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
+ wa = ww1;
+ sigma = tree0[3];
+ si2 = samp(samp_ctx, w2, sigma);
+ si3 = samp(samp_ctx, w3, sigma);
+ ww1 = _mm_set_pd((double)si3, (double)si2);
+ wa = _mm_sub_pd(wa, ww1);
+ wb = _mm_loadu_pd(&tree0[0].v);
+ wc = _mm_mul_pd(wa, wb);
+ wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
+ wa = _mm_unpacklo_pd(wc, wd);
+ wb = _mm_unpackhi_pd(wc, wd);
+ ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
+ w0.v = _mm_cvtsd_f64(ww0);
+ w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
+ sigma = tree0[2];
+ si0 = samp(samp_ctx, w0, sigma);
+ si1 = samp(samp_ctx, w1, sigma);
+ ww0 = _mm_set_pd((double)si1, (double)si0);
+
+ wc = _mm_mul_pd(
+ _mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
+ invsqrt2);
+ wa = _mm_add_pd(ww0, wc);
+ wb = _mm_sub_pd(ww0, wc);
+ ww0 = _mm_unpacklo_pd(wa, wb);
+ ww1 = _mm_unpackhi_pd(wa, wb);
+ _mm_storeu_pd(&z0[0].v, ww0);
+ _mm_storeu_pd(&z0[2].v, ww1);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix uses the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_AVX2_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_AVX2_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Sample an integer value along a half-Gaussian distribution centered
+ * on zero and with standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(prng *p) {
+
+ /*
+ * High words.
+ */
+ static const union {
+ uint16_t u16[16];
+ __m256i ymm[1];
+ } rhi15 = {
+ {
+ 0x51FB, 0x2A69, 0x113E, 0x0568,
+ 0x014A, 0x003B, 0x0008, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000
+ }
+ };
+
+ static const union {
+ uint64_t u64[20];
+ __m256i ymm[5];
+ } rlo57 = {
+ {
+ 0x1F42ED3AC391802, 0x12B181F3F7DDB82,
+ 0x1CDD0934829C1FF, 0x1754377C7994AE4,
+ 0x1846CAEF33F1F6F, 0x14AC754ED74BD5F,
+ 0x024DD542B776AE4, 0x1A1FFDC65AD63DA,
+ 0x01F80D88A7B6428, 0x001C3FDB2040C69,
+ 0x00012CF24D031FB, 0x00000949F8B091F,
+ 0x0000003665DA998, 0x00000000EBF6EBB,
+ 0x0000000002F5D7E, 0x000000000007098,
+ 0x0000000000000C6, 0x000000000000001,
+ 0x000000000000000, 0x000000000000000
+ }
+ };
+
+ uint64_t lo;
+ unsigned hi;
+ __m256i xhi, rhi, gthi, eqhi, eqm;
+ __m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4;
+ __m128i t, zt;
+ int r;
+
+ /*
+ * Get a 72-bit random value and split it into a low part
+ * (57 bits) and a high part (15 bits)
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ hi = (hi << 7) | (unsigned)(lo >> 57);
+ lo &= 0x1FFFFFFFFFFFFFF;
+
+ /*
+ * Broadcast the high part and compare it with the relevant
+ * values. We need both a "greater than" and an "equal"
+ * comparison.
+ */
+ xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128((int)hi));
+ rhi = _mm256_loadu_si256(&rhi15.ymm[0]);
+ gthi = _mm256_cmpgt_epi16(rhi, xhi);
+ eqhi = _mm256_cmpeq_epi16(rhi, xhi);
+
+ /*
+ * The result is the number of 72-bit values (among the list of 19)
+ * which are greater than the 72-bit random value. We first count
+ * all non-zero 16-bit elements in the first eight lanes of gthi. Such
+ * elements have value -1 or 0, so we first negate them.
+ */
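+ /*
+ * Illustrative reference sketch (not part of the upstream code; the
+ * loop below is only a non-constant-time restatement of what this
+ * function computes): the result is the number of the 19 table
+ * entries whose 72-bit value (thi, tlo) strictly exceeds the random
+ * value (hi, lo):
+ *
+ *     int r = 0;
+ *     for (size_t i = 0; i < 19; i ++) {
+ *         unsigned thi = (i < 16) ? rhi15.u16[i] : 0;
+ *         uint64_t tlo = rlo57.u64[i];
+ *         if (thi > hi || (thi == hi && tlo > lo)) {
+ *             r ++;
+ *         }
+ *     }
+ */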
+ t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15);
+ zt = _mm_setzero_si128();
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ t = _mm_hadd_epi16(t, zt);
+ r = _mm_cvtsi128_si32(t);
+
+ /*
+ * We must look at the low bits for all values for which the
+ * high bits are an "equal" match; values 8-18 all have the
+ * same high bits (0).
+ * On 32-bit systems, 'lo' really is two registers, requiring
+ * some extra code.
+ */
+ #if defined(__x86_64__) || defined(_M_X64)
+ xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
+ #else
+ {
+ uint32_t e0, e1;
+ int32_t f0, f1;
+
+ e0 = (uint32_t)lo;
+ e1 = (uint32_t)(lo >> 32);
+ f0 = *(int32_t *)&e0;
+ f1 = *(int32_t *)&e1;
+ xlo = _mm256_set_epi32(f1, f0, f1, f0, f1, f0, f1, f0);
+ }
+ #endif
+ gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
+ gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
+ gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
+ gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo);
+ gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo);
+
+ /*
+ * Keep only comparison results that correspond to the non-zero
+ * elements in eqhi.
+ */
+ gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(eqhi)));
+ gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64(
+ _mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8))));
+ eqm = _mm256_permute4x64_epi64(eqhi, 0xFF);
+ gtlo2 = _mm256_and_si256(gtlo2, eqm);
+ gtlo3 = _mm256_and_si256(gtlo3, eqm);
+ gtlo4 = _mm256_and_si256(gtlo4, eqm);
+
+ /*
+ * Add all values to count the total number of "-1" elements.
+ * Since the first eight "high" words are all different, only
+ * one element (at most) in gtlo0:gtlo1 can be non-zero; however,
+ * if the high word of the random value is zero, then many
+ * elements of gtlo2:gtlo3:gtlo4 can be non-zero.
+ */
+ gtlo0 = _mm256_or_si256(gtlo0, gtlo1);
+ gtlo0 = _mm256_add_epi64(
+ _mm256_add_epi64(gtlo0, gtlo2),
+ _mm256_add_epi64(gtlo3, gtlo4));
+ t = _mm_add_epi64(
+ _mm256_castsi256_si128(gtlo0),
+ _mm256_extracti128_si256(gtlo0, 1));
+ t = _mm_add_epi64(t, _mm_srli_si128(t, 8));
+ r -= _mm_cvtsi128_si32(t);
+
+ return r;
+
+}
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
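+ /*
+ * (Illustrative example, not from the upstream source: for x = 3.0
+ * we get s = 4 and r = 3.0 - 4*log(2) ~= 0.2274, so that
+ * exp(-3.0) = 2^(-4) * exp(-0.2274) ~= 0.0498.)
+ */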
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * we have exp(-x) = 2^(-s)*exp(-r). We compare exp(-x) lazily with
+ * the PRNG output, one byte at a time, to limit randomness
+ * consumption; the sign of the difference yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED512_AVX2_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED512_AVX2_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
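+ /* i.e. z = 1 + z0 when b = 1, and z = -z0 when b = 0
+    (explanatory note). */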
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes the rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signing produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signing produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_AVX2_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_AVX2_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_avx2/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-512_avx2/vrfy.c
new file mode 100644
index 000000000..6abf55d18
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_avx2/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
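+ /*
+ * (Illustrative example: x = -3 becomes y = 2^32 - 3, whose high
+ * bit is set, so q is added back and the result is 12286,
+ * i.e. -3 mod 12289.)
+ */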
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
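+ /*
+ * (Illustrative example: for x = 3 and y = 2, x + y - q wraps
+ * around, so q is added back and the result is 5; for
+ * x = y = 12288, the result is 24576 - 12289 = 12287.)
+ */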
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
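+/*
+ * (Illustrative example: mq_rshift1(1) first adds q to make the value
+ * even, giving 12290, then shifts right to obtain 6145, which is indeed
+ * 1/2 mod q since 2*6145 = 12290 = 1 mod 12289.)
+ */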
+static inline uint32_t
+mq_rshift1(uint32_t x) {
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
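+ /*
+ * (Worked example, illustrative only: with x = 1 and y = R = 4091,
+ * we get z = 4091, k = (4091 * Q0I) mod 2^16 = 5, w = 5*q = 61445,
+ * z + w = 65536, and (z + w) >> 16 = 1, i.e. (1*R)/R mod q.)
+ */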
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x and y are both lower than q, we have
+ * z <= (q - 1)^2, so the sum is at most
+ * (2^16 - 1) * q + (q - 1)^2, which fits on 30 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We do a subtraction then conditional subtraction to
+ * ensure the result is in the expected range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
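+ /*
+ * (Illustrative note, not from the upstream source: q = 12289 is
+ * prime, so Fermat's little theorem gives y^(q-2) = 1/y mod q.
+ * For instance, mq_div_12289(1, 2) returns 6145, and indeed
+ * 2 * 6145 = 12290 = 1 mod 12289.)
+ */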
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
+ return mq_montymul(y18, x);
+}
+
+/*
+ * Compute NTT on a ring element.
+ */
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+ * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_AVX2_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
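+ /*
+ * (Explanatory note, not in the upstream source: a valid Falcon
+ * signature satisfies s1 + s2*h = c0 mod phi mod q, so computing
+ * s2*h - c0 recovers -s1 and lets us check the norm of (s1, s2)
+ * without transmitting s1.)
+ */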
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED512_AVX2_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED512_AVX2_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_AVX2_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/LICENSE b/src/sig/falcon/pqclean_falcon-padded-512_clean/LICENSE
new file mode 100644
index 000000000..18592ab71
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/LICENSE
@@ -0,0 +1,36 @@
+This code is provided under the MIT license:
+
+ * ==========================(LICENSE BEGIN)============================
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * ===========================(LICENSE END)=============================
+
+It was written by Thomas Pornin.
+
+It has been reported that patent US7308097B2 may be applicable to parts
+of Falcon. William Whyte, one of the designers of Falcon and also
+representative of OnBoard Security (current owner of the said patent),
+has pledged, as part of the IP statements submitted to the NIST for the
+PQC project, that in the event of Falcon being selected for
+standardization, a worldwide non-exclusive license to the patent will be
+granted for the purpose of implementing the standard "without
+compensation and under reasonable terms and conditions that are
+demonstrably free of any unfair discrimination".
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/api.h b/src/sig/falcon/pqclean_falcon-padded-512_clean/api.h
new file mode 100644
index 000000000..47c131469
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/api.h
@@ -0,0 +1,80 @@
+#ifndef PQCLEAN_FALCONPADDED512_CLEAN_API_H
+#define PQCLEAN_FALCONPADDED512_CLEAN_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES 1281
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES 897
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES 666
+
+#define PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_ALGNAME "Falcon-padded-512"
+
+/*
+ * Generate a new key pair. Public key goes into pk[], private key in sk[].
+ * Key sizes are exact (in bytes):
+ * public (pk): PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES
+ * private (sk): PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk);
+
+/*
+ * Compute a signature on a provided message (m, mlen), with a given
+ * private key (sk). Signature is written in sig[], with length written
+ * into *siglen. Signature length is variable; maximum signature length
+ * (in bytes) is PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES.
+ *
+ * sig[], m[] and sk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Verify a signature (sig, siglen) on a message (m, mlen) with a given
+ * public key (pk).
+ *
+ * sig[], m[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+/*
+ * Compute a signature on a message and pack the signature and message
+ * into a single object, written into sm[]. The length of that output is
+ * written in *smlen; that length may be larger than the message length
+ * (mlen) by up to PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES.
+ *
+ * sm[] and m[] may overlap each other arbitrarily; however, sm[] shall
+ * not overlap with sk[].
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+/*
+ * Open a signed message object (sm, smlen) and verify the signature;
+ * on success, the message itself is written into m[] and its length
+ * into *mlen. The message is shorter than the signed message object,
+ * but the size difference depends on the signature value; the difference
+ * may range up to PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES.
+ *
+ * m[], sm[] and pk[] may overlap each other arbitrarily.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
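+/*
+ * Minimal usage sketch (illustrative only, not part of the upstream API
+ * header; 'msg' and 'msglen' are hypothetical caller-provided values and
+ * error handling is omitted):
+ *
+ *     uint8_t pk[PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES];
+ *     uint8_t sk[PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES];
+ *     uint8_t sig[PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES];
+ *     size_t siglen;
+ *
+ *     PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(pk, sk);
+ *     PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(
+ *         sig, &siglen, msg, msglen, sk);
+ *     int ok = PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(
+ *         sig, siglen, msg, msglen, pk) == 0;
+ */
+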
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/codec.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/codec.c
new file mode 100644
index 000000000..2105122ec
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/codec.c
@@ -0,0 +1,570 @@
+/*
+ * Encoding/decoding of keys and signatures.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_modq_encode(
+ void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn) {
+ size_t n, out_len, u;
+ uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ if (x[u] >= 12289) {
+ return 0;
+ }
+ }
+ out_len = ((n * 14) + 7) >> 3;
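+ /*
+ * (Illustrative note: for logn = 9, i.e. n = 512, this gives
+ * out_len = 512*14/8 = 896 bytes; together with a one-byte header
+ * this accounts for the 897-byte public key size declared in api.h.)
+ */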
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << 14) | x[u];
+ acc_len += 14;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_modq_decode(
+ uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len, u;
+ const uint8_t *buf;
+ uint32_t acc;
+ int acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * 14) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ u = 0;
+ while (u < n) {
+ acc = (acc << 8) | (*buf ++);
+ acc_len += 8;
+ if (acc_len >= 14) {
+ unsigned w;
+
+ acc_len -= 14;
+ w = (acc >> acc_len) & 0x3FFF;
+ if (w >= 12289) {
+ return 0;
+ }
+ x[u ++] = (uint16_t)w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint16_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_decode(
+ int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ w |= -(w & mask2);
+ x[u ++] = (int16_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits) {
+ size_t n, u, out_len;
+ int minv, maxv;
+ uint8_t *buf;
+ uint32_t acc, mask;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ maxv = (1 << (bits - 1)) - 1;
+ minv = -maxv;
+ for (u = 0; u < n; u ++) {
+ if (x[u] < minv || x[u] > maxv) {
+ return 0;
+ }
+ }
+ out_len = ((n * bits) + 7) >> 3;
+ if (out == NULL) {
+ return out_len;
+ }
+ if (out_len > max_out_len) {
+ return 0;
+ }
+ buf = out;
+ acc = 0;
+ acc_len = 0;
+ mask = ((uint32_t)1 << bits) - 1;
+ for (u = 0; u < n; u ++) {
+ acc = (acc << bits) | ((uint8_t)x[u] & mask);
+ acc_len += bits;
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ *buf ++ = (uint8_t)(acc >> acc_len);
+ }
+ }
+ if (acc_len > 0) {
+ *buf ++ = (uint8_t)(acc << (8 - acc_len));
+ }
+ return out_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len) {
+ size_t n, in_len;
+ const uint8_t *buf;
+ size_t u;
+ uint32_t acc, mask1, mask2;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ in_len = ((n * bits) + 7) >> 3;
+ if (in_len > max_in_len) {
+ return 0;
+ }
+ buf = in;
+ u = 0;
+ acc = 0;
+ acc_len = 0;
+ mask1 = ((uint32_t)1 << bits) - 1;
+ mask2 = (uint32_t)1 << (bits - 1);
+ while (u < n) {
+ acc = (acc << 8) | *buf ++;
+ acc_len += 8;
+ while (acc_len >= bits && u < n) {
+ uint32_t w;
+
+ acc_len -= bits;
+ w = (acc >> acc_len) & mask1;
+ w |= -(w & mask2);
+ if (w == -mask2) {
+ /*
+ * The -2^(bits-1) value is forbidden.
+ */
+ return 0;
+ }
+ x[u ++] = (int8_t) * (int32_t *)&w;
+ }
+ }
+ if ((acc & (((uint32_t)1 << acc_len) - 1)) != 0) {
+ /*
+ * Extra bits in the last byte must be zero.
+ */
+ return 0;
+ }
+ return in_len;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_comp_encode(
+ void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn) {
+ uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = out;
+
+ /*
+ * Make sure that all values are within the -2047..+2047 range.
+ */
+ for (u = 0; u < n; u ++) {
+ if (x[u] < -2047 || x[u] > +2047) {
+ return 0;
+ }
+ }
+
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ int t;
+ unsigned w;
+
+ /*
+ * Get sign and absolute value of next integer; push the
+ * sign bit.
+ */
+ acc <<= 1;
+ t = x[u];
+ if (t < 0) {
+ t = -t;
+ acc |= 1;
+ }
+ w = (unsigned)t;
+
+ /*
+ * Push the low 7 bits of the absolute value.
+ */
+ acc <<= 7;
+ acc |= w & 127u;
+ w >>= 7;
+
+ /*
+ * We pushed exactly 8 bits.
+ */
+ acc_len += 8;
+
+ /*
+ * Push as many zeros as necessary, then a one. Since the
+ * absolute value is at most 2047, w can only range up to
+ * 15 at this point, thus we will add at most 16 bits
+ * here. With the 8 bits above and possibly up to 7 bits
+ * from previous iterations, we may go up to 31 bits, which
+ * will fit in the accumulator, which is a uint32_t.
+ */
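+ /*
+ * (Worked example, illustrative and not from the upstream source:
+ * x[u] = -260 is encoded as the sign bit 1, the low seven bits of
+ * 260 (0000100), then w = 260 >> 7 = 2 zeros and a final 1, for a
+ * total of 11 bits: 1 0000100 001.)
+ */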
+ acc <<= (w + 1);
+ acc |= 1;
+ acc_len += w + 1;
+
+ /*
+ * Produce all full bytes.
+ */
+ while (acc_len >= 8) {
+ acc_len -= 8;
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc >> acc_len);
+ }
+ v ++;
+ }
+ }
+
+ /*
+ * Flush remaining bits (if any).
+ */
+ if (acc_len > 0) {
+ if (buf != NULL) {
+ if (v >= max_out_len) {
+ return 0;
+ }
+ buf[v] = (uint8_t)(acc << (8 - acc_len));
+ }
+ v ++;
+ }
+
+ return v;
+}
+
+/* see inner.h */
+size_t
+PQCLEAN_FALCONPADDED512_CLEAN_comp_decode(
+ int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len) {
+ const uint8_t *buf;
+ size_t n, u, v;
+ uint32_t acc;
+ unsigned acc_len;
+
+ n = (size_t)1 << logn;
+ buf = in;
+ acc = 0;
+ acc_len = 0;
+ v = 0;
+ for (u = 0; u < n; u ++) {
+ unsigned b, s, m;
+
+ /*
+ * Get next eight bits: sign and low seven bits of the
+ * absolute value.
+ */
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ b = acc >> acc_len;
+ s = b & 128;
+ m = b & 127;
+
+ /*
+ * Get next bits until a 1 is reached.
+ */
+ for (;;) {
+ if (acc_len == 0) {
+ if (v >= max_in_len) {
+ return 0;
+ }
+ acc = (acc << 8) | (uint32_t)buf[v ++];
+ acc_len = 8;
+ }
+ acc_len --;
+ if (((acc >> acc_len) & 1) != 0) {
+ break;
+ }
+ m += 128;
+ if (m > 2047) {
+ return 0;
+ }
+ }
+
+ /*
+ * "-0" is forbidden.
+ */
+ if (s && m == 0) {
+ return 0;
+ }
+ if (s) {
+ x[u] = (int16_t) - m;
+ } else {
+ x[u] = (int16_t)m;
+ }
+ }
+
+ /*
+ * Unused bits in the last byte must be zero.
+ */
+ if ((acc & ((1u << acc_len) - 1u)) != 0) {
+ return 0;
+ }
+
+ return v;
+}
+
+/*
+ * Key elements and signatures are polynomials with small integer
+ * coefficients. Here are some statistics gathered over many
+ * generated key pairs (10000 or more for each degree):
+ *
+ * log(n) n max(f,g) std(f,g) max(F,G) std(F,G)
+ * 1 2 129 56.31 143 60.02
+ * 2 4 123 40.93 160 46.52
+ * 3 8 97 28.97 159 38.01
+ * 4 16 100 21.48 154 32.50
+ * 5 32 71 15.41 151 29.36
+ * 6 64 59 11.07 138 27.77
+ * 7 128 39 7.91 144 27.00
+ * 8 256 32 5.63 148 26.61
+ * 9 512 22 4.00 137 26.46
+ * 10 1024 15 2.84 146 26.41
+ *
+ * We want a compact storage format for the private key and, as part of
+ * key generation, we are allowed to reject some keys which would
+ * otherwise be fine (this does not induce any noticeable vulnerability
+ * as long as we reject only a small proportion of possible keys).
+ * Hence, we enforce at key generation time maximum values for the
+ * elements of f, g, F and G, so that their encoding can be expressed
+ * in fixed-width values. Limits have been chosen so that generated
+ * keys are almost always within bounds, thus impacting neither
+ * security nor performance.
+ *
+ * IMPORTANT: the code assumes that all coefficients of f, g, F and G
+ * ultimately fit in the -127..+127 range. Thus, none of the elements
+ * of max_fg_bits[] and max_FG_bits[] shall be greater than 8.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 7,
+ 7,
+ 6,
+ 6,
+ 5
+};
+
+const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[] = {
+ 0, /* unused */
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8,
+ 8
+};
+
+/*
+ * When generating a new key pair, we can always reject keys which
+ * feature an abnormally large coefficient. This can also be done for
+ * signatures, albeit with some care: in case the signature process is
+ * used in a derandomized setup (explicitly seeded with the message and
+ * private key), we have to follow the specification faithfully, and the
+ * specification only enforces a limit on the L2 norm of the signature
+ * vector. The limit on the L2 norm implies that the absolute value of
+ * a coefficient of the signature cannot be more than the following:
+ *
+ * log(n) n max sig coeff (theoretical)
+ * 1 2 412
+ * 2 4 583
+ * 3 8 824
+ * 4 16 1166
+ * 5 32 1649
+ * 6 64 2332
+ * 7 128 3299
+ * 8 256 4665
+ * 9 512 6598
+ * 10 1024 9331
+ *
+ * However, the largest signature coefficient observed during our
+ * experiments was 1077 (in absolute value), hence we can assume that,
+ * with overwhelming probability, signature coefficients will fit
+ * in -2047..2047, i.e. 12 bits.
+ */
+
+const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_sig_bits[] = {
+ 0, /* unused */
+ 10,
+ 11,
+ 11,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12,
+ 12
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/common.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/common.c
new file mode 100644
index 000000000..74e88e903
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/common.c
@@ -0,0 +1,302 @@
+/*
+ * Support functions for signatures (hash-to-point, norm).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_vartime(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn) {
+ /*
+ * This is the straightforward per-the-spec implementation. It
+ * is not constant-time, thus it might reveal information on the
+ * plaintext (at least, enough to check the plaintext against a
+ * list of potential plaintexts) in a scenario where the
+ * attacker does not have access to the signature value or to
+ * the public key, but knows the nonce (without knowledge of the
+ * nonce, the hashed output cannot be matched against potential
+ * plaintexts).
+ */
+ size_t n;
+
+ n = (size_t)1 << logn;
+ while (n > 0) {
+ uint8_t buf[2];
+ uint32_t w;
+
+ inner_shake256_extract(sc, (void *)buf, sizeof buf);
+ w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+ if (w < 61445) {
+ while (w >= 12289) {
+ w -= 12289;
+ }
+ *x ++ = (uint16_t)w;
+ n --;
+ }
+ }
+}
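+
+/*
+ * Illustrative note on the rejection step above (not part of the
+ * reference code): a 16-bit sample w is kept only when
+ * w < 61445 = 5 * 12289, so the acceptance range contains exactly five
+ * representatives of each residue class modulo q = 12289, and the
+ * retained value w mod q is uniform in 0..12288. The reduction loop
+ * performs at most four subtractions, since w < 5*q.
+ */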
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(
+ inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp) {
+ /*
+ * Each 16-bit sample is a value in 0..65535. The value is
+ * kept if it falls in 0..61444 (because 61445 = 5*12289)
+ * and rejected otherwise; thus, each sample has probability
+ * about 0.93758 of being selected.
+ *
+ * We want to oversample enough to be sure that we will
+ * have enough values with probability at least 1 - 2^(-256).
+ * Depending on degree N, this leads to the following
+ * required oversampling:
+ *
+ * logn n oversampling
+ * 1 2 65
+ * 2 4 67
+ * 3 8 71
+ * 4 16 77
+ * 5 32 86
+ * 6 64 100
+ * 7 128 122
+ * 8 256 154
+ * 9 512 205
+ * 10 1024 287
+ *
+ * If logn >= 7, then the provided temporary buffer is large
+ * enough. Otherwise, we use a stack buffer of 63 entries
+ * (i.e. 126 bytes) for the values that do not fit in tmp[].
+ */
+
+ static const uint16_t overtab[] = {
+ 0, /* unused */
+ 65,
+ 67,
+ 71,
+ 77,
+ 86,
+ 100,
+ 122,
+ 154,
+ 205,
+ 287
+ };
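+
+    /*
+     * Illustrative check of the table above (not part of the reference
+     * code): samples with index 2*n or more spill into the stack buffer
+     * tt2[], which therefore needs overtab[logn] - n entries whenever
+     * that quantity is positive. The worst cases are logn = 1, 2 and 3,
+     * where 65 - 2 = 67 - 4 = 71 - 8 = 63 entries are needed, hence the
+     * 63-entry tt2[] declared below; for logn >= 7 the caller-provided
+     * tmp[] is large enough on its own.
+     */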
+
+ unsigned n, n2, u, m, p, over;
+ uint16_t *tt1, tt2[63];
+
+ /*
+     * We first generate m 16-bit values. Values 0..n-1 go to x[].
+ * Values n..2*n-1 go to tt1[]. Values 2*n and later go to tt2[].
+     * We also reduce the values modulo q; rejected values are set
+ * to 0xFFFF.
+ */
+ n = 1U << logn;
+ n2 = n << 1;
+ over = overtab[logn];
+ m = n + over;
+ tt1 = (uint16_t *)tmp;
+ for (u = 0; u < m; u ++) {
+ uint8_t buf[2];
+ uint32_t w, wr;
+
+ inner_shake256_extract(sc, buf, sizeof buf);
+ w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
+ wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
+ wr = wr - ((uint32_t)12289 & (((wr - 12289) >> 31) - 1));
+ wr |= ((w - 61445) >> 31) - 1;
+ if (u < n) {
+ x[u] = (uint16_t)wr;
+ } else if (u < n2) {
+ tt1[u - n] = (uint16_t)wr;
+ } else {
+ tt2[u - n2] = (uint16_t)wr;
+ }
+ }
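+
+    /*
+     * Worked example of the branchless reduction above (illustrative
+     * note, not part of the reference code): for w = 30000, the first
+     * conditional subtraction of 2*q = 24578 gives wr = 5422, the next
+     * two leave it unchanged, and since 30000 < 61445 the sample is
+     * kept with wr = 30000 mod 12289 = 5422. For w = 62000 >= 61445,
+     * the final mask sets wr to all-ones, i.e. the 0xFFFF "rejected"
+     * marker.
+     */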
+
+ /*
+ * Now we must "squeeze out" the invalid values. We do this in
+ * a logarithmic sequence of passes; each pass computes where a
+ * value should go, and moves it down by 'p' slots if necessary,
+ * where 'p' uses an increasing powers-of-two scale. It can be
+ * shown that in all cases where the loop decides that a value
+ * has to be moved down by p slots, the destination slot is
+ * "free" (i.e. contains an invalid value).
+ */
+ for (p = 1; p <= over; p <<= 1) {
+ unsigned v;
+
+ /*
+ * In the loop below:
+ *
+ * - v contains the index of the final destination of
+ * the value; it is recomputed dynamically based on
+ * whether values are valid or not.
+ *
+ * - u is the index of the value we consider ("source");
+ * its address is s.
+ *
+ * - The loop may swap the value with the one at index
+ * u-p. The address of the swap destination is d.
+ */
+ v = 0;
+ for (u = 0; u < m; u ++) {
+ uint16_t *s, *d;
+ unsigned j, sv, dv, mk;
+
+ if (u < n) {
+ s = &x[u];
+ } else if (u < n2) {
+ s = &tt1[u - n];
+ } else {
+ s = &tt2[u - n2];
+ }
+ sv = *s;
+
+ /*
+ * The value in sv should ultimately go to
+ * address v, i.e. jump back by u-v slots.
+ */
+ j = u - v;
+
+ /*
+ * We increment v for the next iteration, but
+ * only if the source value is valid. The mask
+ * 'mk' is -1 if the value is valid, 0 otherwise,
+ * so we _subtract_ mk.
+ */
+ mk = (sv >> 15) - 1U;
+ v -= mk;
+
+ /*
+ * In this loop we consider jumps by p slots; if
+ * u < p then there is nothing more to do.
+ */
+ if (u < p) {
+ continue;
+ }
+
+ /*
+ * Destination for the swap: value at address u-p.
+ */
+ if ((u - p) < n) {
+ d = &x[u - p];
+ } else if ((u - p) < n2) {
+ d = &tt1[(u - p) - n];
+ } else {
+ d = &tt2[(u - p) - n2];
+ }
+ dv = *d;
+
+ /*
+ * The swap should be performed only if the source
+ * is valid AND the jump j has its 'p' bit set.
+ */
+ mk &= -(((j & p) + 0x1FF) >> 9);
+
+ *s = (uint16_t)(sv ^ (mk & (sv ^ dv)));
+ *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
+ }
+ }
+}
+
+/*
+ * Acceptance bound for the (squared) l2-norm of the signature depends
+ * on the degree. This array is indexed by logn (1 to 10). These bounds
+ * are _inclusive_ (they are equal to floor(beta^2)).
+ */
+static const uint32_t l2bound[] = {
+ 0, /* unused */
+ 101498,
+ 208714,
+ 428865,
+ 892039,
+ 1852696,
+ 3842630,
+ 7959734,
+ 16468416,
+ 34034726,
+ 70265242
+};
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_is_short(
+ const int16_t *s1, const int16_t *s2, unsigned logn) {
+ /*
+ * We use the l2-norm. Code below uses only 32-bit operations to
+ * compute the square of the norm with saturation to 2^32-1 if
+ * the value exceeds 2^31-1.
+ */
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = (size_t)1 << logn;
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s1[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ z = s2[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ s |= -(ng >> 31);
+
+ return s <= l2bound[logn];
+}
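+
+/*
+ * Illustrative note on the saturation trick above (not part of the
+ * reference code): each term z*z is at most 2^30 (|z| <= 2^15), so the
+ * 32-bit accumulator must pass through a value with its top bit set
+ * before it can wrap around. OR-ing every partial sum into ng records
+ * any such excursion, and "s |= -(ng >> 31)" then forces s to
+ * 0xFFFFFFFF, which exceeds every entry of l2bound[], so an
+ * overflowing (hence too large) signature is always rejected.
+ */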
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(
+ uint32_t sqn, const int16_t *s2, unsigned logn) {
+ size_t n, u;
+ uint32_t ng;
+
+ n = (size_t)1 << logn;
+ ng = -(sqn >> 31);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = s2[u];
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ }
+ sqn |= -(ng >> 31);
+
+ return sqn <= l2bound[logn];
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/fft.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/fft.c
new file mode 100644
index 000000000..011fbe11d
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/fft.c
@@ -0,0 +1,699 @@
+/*
+ * FFT code.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Rules for complex number macros:
+ * --------------------------------
+ *
+ * Operand order is: destination, source1, source2...
+ *
+ * Each operand is a real and an imaginary part.
+ *
+ * All overlaps are allowed.
+ */
+
+/*
+ * Addition of two complex numbers (d = a + b).
+ */
+#define FPC_ADD(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_add(a_re, b_re); \
+ fpct_im = fpr_add(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Subtraction of two complex numbers (d = a - b).
+ */
+#define FPC_SUB(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_re, fpct_im; \
+ fpct_re = fpr_sub(a_re, b_re); \
+ fpct_im = fpr_sub(a_im, b_im); \
+ (d_re) = fpct_re; \
+ (d_im) = fpct_im; \
+ } while (0)
+
+/*
+ * Multiplication of two complex numbers (d = a * b).
+ */
+#define FPC_MUL(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Squaring of a complex number (d = a * a).
+ */
+#define FPC_SQR(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_d_re = fpr_sub(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_d_im = fpr_double(fpr_mul(fpct_a_re, fpct_a_im)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Inversion of a complex number (d = 1 / a).
+ */
+#define FPC_INV(d_re, d_im, a_re, a_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_a_re), fpr_sqr(fpct_a_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_d_re = fpr_mul(fpct_a_re, fpct_m); \
+ fpct_d_im = fpr_mul(fpr_neg(fpct_a_im), fpct_m); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
+
+/*
+ * Division of complex numbers (d = a / b).
+ */
+#define FPC_DIV(d_re, d_im, a_re, a_im, b_re, b_im) do { \
+ fpr fpct_a_re, fpct_a_im; \
+ fpr fpct_b_re, fpct_b_im; \
+ fpr fpct_d_re, fpct_d_im; \
+ fpr fpct_m; \
+ fpct_a_re = (a_re); \
+ fpct_a_im = (a_im); \
+ fpct_b_re = (b_re); \
+ fpct_b_im = (b_im); \
+ fpct_m = fpr_add(fpr_sqr(fpct_b_re), fpr_sqr(fpct_b_im)); \
+ fpct_m = fpr_inv(fpct_m); \
+ fpct_b_re = fpr_mul(fpct_b_re, fpct_m); \
+ fpct_b_im = fpr_mul(fpr_neg(fpct_b_im), fpct_m); \
+ fpct_d_re = fpr_sub( \
+ fpr_mul(fpct_a_re, fpct_b_re), \
+ fpr_mul(fpct_a_im, fpct_b_im)); \
+ fpct_d_im = fpr_add( \
+ fpr_mul(fpct_a_re, fpct_b_im), \
+ fpr_mul(fpct_a_im, fpct_b_re)); \
+ (d_re) = fpct_d_re; \
+ (d_im) = fpct_d_im; \
+ } while (0)
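+
+/*
+ * Minimal usage sketch for the macros above (illustrative only, not
+ * part of the reference code; it assumes fpr_of() from fpr.h). It
+ * computes (3 + 4i) * (1 + 2i) = -5 + 10i.
+ */
+static void
+fpc_mul_example(void) {
+    fpr a_re = fpr_of(3), a_im = fpr_of(4);
+    fpr b_re = fpr_of(1), b_im = fpr_of(2);
+    fpr c_re, c_im;
+
+    FPC_MUL(c_re, c_im, a_re, a_im, b_re, b_im);
+    /* c_re now holds -5.0 and c_im holds 10.0 (as fpr values) */
+    (void)c_re;
+    (void)c_im;
+}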
+
+/*
+ * Let w = exp(i*pi/N); w is a primitive 2N-th root of 1. We define the
+ * values w_j = w^(2j+1) for all j from 0 to N-1: these are the roots
+ * of X^N+1 in the field of complex numbers. A crucial property is that
+ * w_{N-1-j} = conj(w_j) = 1/w_j for all j.
+ *
+ * FFT representation of a polynomial f (taken modulo X^N+1) is the
+ * set of values f(w_j). Since f is real, conj(f(w_j)) = f(conj(w_j)),
+ * thus f(w_{N-1-j}) = conj(f(w_j)). We thus store only half the values,
+ * for j = 0 to N/2-1; the other half can be recomputed easily when (if)
+ * needed. A consequence is that FFT representation has the same size
+ * as normal representation: N/2 complex numbers use N real numbers (each
+ * complex number is the combination of a real and an imaginary part).
+ *
+ * We use a specific ordering which makes computations easier. Let rev()
+ * be the bit-reversal function over log(N) bits. For j in 0..N/2-1, we
+ * store the real and imaginary parts of f(w_j) in slots:
+ *
+ * Re(f(w_j)) -> slot rev(j)/2
+ * Im(f(w_j)) -> slot rev(j)/2+N/2
+ *
+ * (Note that rev(j) is even for j < N/2.)
+ */
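+
+/*
+ * Worked example of this layout for logn = 2 (N = 4), as an
+ * illustrative note (not part of the reference code): rev() works over
+ * 2 bits, so rev(0) = 0 and rev(1) = 2. Re(f(w_0)) and Re(f(w_1)) land
+ * in slots 0 and 1, and Im(f(w_0)) and Im(f(w_1)) in slots 2 and 3;
+ * f(w_2) and f(w_3) are the conjugates of f(w_1) and f(w_0) and are
+ * not stored.
+ */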
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_FFT(fpr *f, unsigned logn) {
+ /*
+ * FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = N
+ * for m = 1; m < N; m *= 2:
+ * ht = t/2
+ * for i1 = 0; i1 < m; i1 ++:
+ * j1 = i1 * t
+ * s = GM[m + i1]
+ * for j = j1; j < (j1 + ht); j ++:
+ * x = f[j]
+ * y = s * f[j + ht]
+ * f[j] = x + y
+ * f[j + ht] = x - y
+ * t = ht
+ *
+ * GM[k] contains w^rev(k) for primitive root w = exp(i*pi/N).
+ *
+ * In the description above, f[] is supposed to contain complex
+ * numbers. In our in-memory representation, the real and
+ * imaginary parts of f[k] are in array slots k and k+N/2.
+ *
+ * We only keep the first half of the complex numbers. We can
+ * see that after the first iteration, the first and second halves
+ * of the array of complex numbers have separate lives, so we
+ * simply ignore the second part.
+ */
+
+ unsigned u;
+ size_t t, n, hn, m;
+
+ /*
+ * First iteration: compute f[j] + i * f[j+N/2] for all j < N/2
+ * (because GM[1] = w^rev(1) = w^(N/2) = i).
+ * In our chosen representation, this is a no-op: everything is
+ * already where it should be.
+ */
+
+ /*
+ * Subsequent iterations are truncated to use only the first
+ * half of values.
+ */
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ t = hn;
+ for (u = 1, m = 2; u < logn; u ++, m <<= 1) {
+ size_t ht, hm, i1, j1;
+
+ ht = t >> 1;
+ hm = m >> 1;
+ for (i1 = 0, j1 = 0; i1 < hm; i1 ++, j1 += t) {
+ size_t j, j2;
+
+ j2 = j1 + ht;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((m + i1) << 1) + 0];
+ s_im = fpr_gm_tab[((m + i1) << 1) + 1];
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + ht];
+ y_im = f[j + ht + hn];
+ FPC_MUL(y_re, y_im, y_re, y_im, s_re, s_im);
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(f[j + ht], f[j + ht + hn],
+ x_re, x_im, y_re, y_im);
+ }
+ }
+ t = ht;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_iFFT(fpr *f, unsigned logn) {
+ /*
+ * Inverse FFT algorithm in bit-reversal order uses the following
+ * iterative algorithm:
+ *
+ * t = 1
+ * for m = N; m > 1; m /= 2:
+ * hm = m/2
+ * dt = t*2
+ * for i1 = 0; i1 < hm; i1 ++:
+ * j1 = i1 * dt
+ * s = iGM[hm + i1]
+ * for j = j1; j < (j1 + t); j ++:
+ * x = f[j]
+ * y = f[j + t]
+ * f[j] = x + y
+ * f[j + t] = s * (x - y)
+ * t = dt
+ * for i1 = 0; i1 < N; i1 ++:
+ * f[i1] = f[i1] / N
+ *
+ * iGM[k] contains (1/w)^rev(k) for primitive root w = exp(i*pi/N)
+ * (actually, iGM[k] = 1/GM[k] = conj(GM[k])).
+ *
+ * In the main loop (not counting the final division loop), in
+ * all iterations except the last, the first and second half of f[]
+ * (as an array of complex numbers) are separate. In our chosen
+ * representation, we do not keep the second half.
+ *
+ * The last iteration recombines the recomputed half with the
+ * implicit half, and should yield only real numbers since the
+ * target polynomial is real; moreover, s = i at that step.
+ * Thus, when considering x and y:
+ * y = conj(x) since the final f[j] must be real
+ * Therefore, f[j] is filled with 2*Re(x), and f[j + t] is
+ * filled with 2*Im(x).
+ * But we already have Re(x) and Im(x) in array slots j and j+t
+ * in our chosen representation. That last iteration is thus a
+     * simple doubling of all the values in the array.
+ *
+ * We make the last iteration a no-op by tweaking the final
+ * division into a division by N/2, not N.
+ */
+ size_t u, n, hn, t, m;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ hn = n >> 1;
+ for (u = logn; u > 1; u --) {
+ size_t hm, dt, i1, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i1 = 0, j1 = 0; j1 < hn; i1 ++, j1 += dt) {
+ size_t j, j2;
+
+ j2 = j1 + t;
+ fpr s_re, s_im;
+
+ s_re = fpr_gm_tab[((hm + i1) << 1) + 0];
+ s_im = fpr_neg(fpr_gm_tab[((hm + i1) << 1) + 1]);
+ for (j = j1; j < j2; j ++) {
+ fpr x_re, x_im, y_re, y_im;
+
+ x_re = f[j];
+ x_im = f[j + hn];
+ y_re = f[j + t];
+ y_im = f[j + t + hn];
+ FPC_ADD(f[j], f[j + hn],
+ x_re, x_im, y_re, y_im);
+ FPC_SUB(x_re, x_im, x_re, x_im, y_re, y_im);
+ FPC_MUL(f[j + t], f[j + t + hn],
+ x_re, x_im, s_re, s_im);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * Last iteration is a no-op, provided that we divide by N/2
+ * instead of N. We need to make a special case for logn = 0.
+ */
+ if (logn > 0) {
+ fpr ni;
+
+ ni = fpr_p2_tab[logn];
+ for (u = 0; u < n; u ++) {
+ f[u] = fpr_mul(f[u], ni);
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_add(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_add(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_sub(a[u], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(fpr *a, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = (n >> 1); u < n; u ++) {
+ a[u] = fpr_neg(a[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
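+
+/*
+ * Illustrative sketch (not part of the reference code; it only assumes
+ * the prototypes already declared in inner.h): multiplying two
+ * polynomials modulo X^N + 1 by going through the FFT domain. Both
+ * inputs are given in coefficient representation and are overwritten.
+ */
+static void
+poly_mul_example(fpr *a, fpr *b, unsigned logn) {
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(a, logn);              /* a -> FFT(a) */
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(b, logn);              /* b -> FFT(b) */
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(a, b, logn);  /* pointwise product */
+    PQCLEAN_FALCONPADDED512_CLEAN_iFFT(a, logn);             /* a -> a*b mod X^N+1 */
+}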
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = fpr_neg(b[u + hn]);
+ FPC_MUL(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn) {
+ /*
+ * Since each coefficient is multiplied with its own conjugate,
+ * the result contains only real values.
+ */
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ a[u] = fpr_add(fpr_sqr(a_re), fpr_sqr(a_im));
+ a[u + hn] = fpr_zero;
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn) {
+ size_t n, u;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ a[u] = fpr_mul(a[u], x);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_div_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ FPC_DIV(a[u], a[u + hn], a_re, a_im, b_re, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr a_re, a_im;
+ fpr b_re, b_im;
+
+ a_re = a[u];
+ a_im = a[u + hn];
+ b_re = b[u];
+ b_im = b[u + hn];
+ d[u] = fpr_inv(fpr_add(
+ fpr_add(fpr_sqr(a_re), fpr_sqr(a_im)),
+ fpr_add(fpr_sqr(b_re), fpr_sqr(b_im))));
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr F_re, F_im, G_re, G_im;
+ fpr f_re, f_im, g_re, g_im;
+ fpr a_re, a_im, b_re, b_im;
+
+ F_re = F[u];
+ F_im = F[u + hn];
+ G_re = G[u];
+ G_im = G[u + hn];
+ f_re = f[u];
+ f_im = f[u + hn];
+ g_re = g[u];
+ g_im = g[u + hn];
+
+ FPC_MUL(a_re, a_im, F_re, F_im, f_re, fpr_neg(f_im));
+ FPC_MUL(b_re, b_im, G_re, G_im, g_re, fpr_neg(g_im));
+ d[u] = fpr_add(a_re, b_re);
+ d[u + hn] = fpr_add(a_im, b_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ a[u] = fpr_mul(a[u], b[u]);
+ a[u + hn] = fpr_mul(a[u + hn], b[u]);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_div_autoadj_fft(
+ fpr *a, const fpr *b, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr ib;
+
+ ib = fpr_inv(b[u]);
+ a[u] = fpr_mul(a[u], ib);
+ a[u + hn] = fpr_mul(a[u + hn], ib);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_LDL_fft(
+ const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(g11[u], g11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ g01[u] = mu_re;
+ g01[u + hn] = fpr_neg(mu_im);
+ }
+}
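+
+/*
+ * Illustrative note (not part of the reference code): per FFT slot the
+ * loop above computes mu = g01 / g00, replaces g11 with
+ * d11 = g11 - mu*adj(g01) = g11 - |g01|^2 / g00, and writes adj(mu)
+ * back into g01. These are the L and D factors of the LDL*
+ * decomposition of the self-adjoint 2x2 matrix
+ * [[g00, g01], [adj(g01), g11]], as used by Falcon's fast Fourier
+ * sampling.
+ */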
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(
+ fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn) {
+ size_t n, hn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ for (u = 0; u < hn; u ++) {
+ fpr g00_re, g00_im, g01_re, g01_im, g11_re, g11_im;
+ fpr mu_re, mu_im;
+
+ g00_re = g00[u];
+ g00_im = g00[u + hn];
+ g01_re = g01[u];
+ g01_im = g01[u + hn];
+ g11_re = g11[u];
+ g11_im = g11[u + hn];
+ FPC_DIV(mu_re, mu_im, g01_re, g01_im, g00_re, g00_im);
+ FPC_MUL(g01_re, g01_im, mu_re, mu_im, g01_re, fpr_neg(g01_im));
+ FPC_SUB(d11[u], d11[u + hn], g11_re, g11_im, g01_re, g01_im);
+ l10[u] = mu_re;
+ l10[u + hn] = fpr_neg(mu_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(
+ fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn) {
+ /*
+ * The FFT representation we use is in bit-reversed order
+ * (element i contains f(w^(rev(i))), where rev() is the
+     * bit-reversal function over the ring degree). This changes
+     * indexes with regard to the Falcon specification.
+ */
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * We process complex values by pairs. For logn = 1, there is only
+ * one complex value (the other one is the implicit conjugate),
+ * so we add the two lines below because the loop will be
+ * skipped.
+ */
+ f0[0] = f[0];
+ f1[0] = f[hn];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f[(u << 1) + 0];
+ a_im = f[(u << 1) + 0 + hn];
+ b_re = f[(u << 1) + 1];
+ b_im = f[(u << 1) + 1 + hn];
+
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f0[u] = fpr_half(t_re);
+ f0[u + qn] = fpr_half(t_im);
+
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ FPC_MUL(t_re, t_im, t_re, t_im,
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_neg(fpr_gm_tab[((u + hn) << 1) + 1]));
+ f1[u] = fpr_half(t_re);
+ f1[u + qn] = fpr_half(t_im);
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(
+ fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn) {
+ size_t n, hn, qn, u;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ qn = hn >> 1;
+
+ /*
+ * An extra copy to handle the special case logn = 1.
+ */
+ f[0] = f0[0];
+ f[hn] = f1[0];
+
+ for (u = 0; u < qn; u ++) {
+ fpr a_re, a_im, b_re, b_im;
+ fpr t_re, t_im;
+
+ a_re = f0[u];
+ a_im = f0[u + qn];
+ FPC_MUL(b_re, b_im, f1[u], f1[u + qn],
+ fpr_gm_tab[((u + hn) << 1) + 0],
+ fpr_gm_tab[((u + hn) << 1) + 1]);
+ FPC_ADD(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 0] = t_re;
+ f[(u << 1) + 0 + hn] = t_im;
+ FPC_SUB(t_re, t_im, a_re, a_im, b_re, b_im);
+ f[(u << 1) + 1] = t_re;
+ f[(u << 1) + 1 + hn] = t_im;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.c
new file mode 100644
index 000000000..82ff1df46
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.c
@@ -0,0 +1,1622 @@
+/*
+ * Floating-point operations.
+ *
+ * This file implements the non-inline functions declared in
+ * fpr.h, as well as the constants for FFT / iFFT.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/*
+ * Normalize a provided unsigned integer to the 2^63..2^64-1 range by
+ * left-shifting it if necessary. The exponent e is adjusted accordingly
+ * (i.e. if the value was left-shifted by n bits, then n is subtracted
+ * from e). If source m is 0, then it remains 0, but e is altered.
+ * Both m and e must be simple variables (no expressions allowed).
+ */
+#define FPR_NORM64(m, e) do { \
+ uint32_t nt; \
+ \
+ (e) -= 63; \
+ \
+ nt = (uint32_t)((m) >> 32); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 32)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 5); \
+ \
+ nt = (uint32_t)((m) >> 48); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 16)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 4); \
+ \
+ nt = (uint32_t)((m) >> 56); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 8)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 3); \
+ \
+ nt = (uint32_t)((m) >> 60); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 4)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 2); \
+ \
+ nt = (uint32_t)((m) >> 62); \
+ nt = (nt | -nt) >> 31; \
+ (m) ^= ((m) ^ ((m) << 2)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt << 1); \
+ \
+ nt = (uint32_t)((m) >> 63); \
+ (m) ^= ((m) ^ ((m) << 1)) & ((uint64_t)nt - 1); \
+ (e) += (int)(nt); \
+ } while (0)
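+
+/*
+ * Small self-check of the macro above (illustrative note, not part of
+ * the reference code): for m = 1 every stage shifts, for a total of
+ * 63 bits, so m ends up equal to 2^63 and e is lowered by 63; for an
+ * already-normalized m (top bit set) no stage shifts and the "+32",
+ * "+16", ..., "+1" corrections add back the 63 subtracted up front,
+ * leaving e unchanged.
+ */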
+
+fpr
+fpr_scaled(int64_t i, int sc) {
+ /*
+ * To convert from int to float, we have to do the following:
+ * 1. Get the absolute value of the input, and its sign
+ * 2. Shift right or left the value as appropriate
+ * 3. Pack the result
+ *
+ * We can assume that the source integer is not -2^63.
+ */
+ int s, e;
+ uint32_t t;
+ uint64_t m;
+
+ /*
+ * Extract sign bit.
+ * We have: -i = 1 + ~i
+ */
+ s = (int)((uint64_t)i >> 63);
+ i ^= -(int64_t)s;
+ i += s;
+
+ /*
+ * For now we suppose that i != 0.
+     * We set m to i and left-shift it as much as needed
+ * to get a 1 in the top bit. We can do that in a logarithmic
+ * number of conditional shifts.
+ */
+ m = (uint64_t)i;
+ e = 9 + sc;
+ FPR_NORM64(m, e);
+
+ /*
+ * Now m is in the 2^63..2^64-1 range. We must divide it by 512;
+ * if one of the dropped bits is a 1, this should go into the
+ * "sticky bit".
+ */
+ m |= ((uint32_t)m & 0x1FF) + 0x1FF;
+ m >>= 9;
+
+ /*
+ * Corrective action: if i = 0 then all of the above was
+ * incorrect, and we clamp e and m down to zero.
+ */
+ t = (uint32_t)((uint64_t)(i | -i) >> 63);
+ m &= -(uint64_t)t;
+ e &= -(int)t;
+
+ /*
+ * Assemble back everything. The FPR() function will handle cases
+ * where e is too low.
+ */
+ return FPR(s, e, m);
+}
+
+fpr
+fpr_add(fpr x, fpr y) {
+ uint64_t m, xu, yu, za;
+ uint32_t cs;
+ int ex, ey, sx, sy, cc;
+
+ /*
+ * Make sure that the first operand (x) has the larger absolute
+ * value. This guarantees that the exponent of y is less than
+ * or equal to the exponent of x, and, if they are equal, then
+ * the mantissa of y will not be greater than the mantissa of x.
+ *
+ * After this swap, the result will have the sign x, except in
+ * the following edge case: abs(x) = abs(y), and x and y have
+ * opposite sign bits; in that case, the result shall be +0
+ * even if the sign bit of x is 1. To handle this case properly,
+     * we do the swap if abs(x) = abs(y) AND the sign of x is 1.
+ */
+ m = ((uint64_t)1 << 63) - 1;
+ za = (x & m) - (y & m);
+ cs = (uint32_t)(za >> 63)
+ | ((1U - (uint32_t)(-za >> 63)) & (uint32_t)(x >> 63));
+ m = (x ^ y) & -(uint64_t)cs;
+ x ^= m;
+ y ^= m;
+
+ /*
+ * Extract sign bits, exponents and mantissas. The mantissas are
+ * scaled up to 2^55..2^56-1, and the exponent is unbiased. If
+ * an operand is zero, its mantissa is set to 0 at this step, and
+ * its exponent will be -1078.
+ */
+ ex = (int)(x >> 52);
+ sx = ex >> 11;
+ ex &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ex + 0x7FF) >> 11) << 52;
+ xu = ((x & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ex -= 1078;
+ ey = (int)(y >> 52);
+ sy = ey >> 11;
+ ey &= 0x7FF;
+ m = (uint64_t)(uint32_t)((ey + 0x7FF) >> 11) << 52;
+ yu = ((y & (((uint64_t)1 << 52) - 1)) | m) << 3;
+ ey -= 1078;
+
+ /*
+ * x has the larger exponent; hence, we only need to right-shift y.
+ * If the shift count is larger than 59 bits then we clamp the
+ * value to zero.
+ */
+ cc = ex - ey;
+ yu &= -(uint64_t)((uint32_t)(cc - 60) >> 31);
+ cc &= 63;
+
+ /*
+ * The lowest bit of yu is "sticky".
+ */
+ m = fpr_ulsh(1, cc) - 1;
+ yu |= (yu & m) + m;
+ yu = fpr_ursh(yu, cc);
+
+ /*
+ * If the operands have the same sign, then we add the mantissas;
+ * otherwise, we subtract the mantissas.
+ */
+ xu += yu - ((yu << 1) & -(uint64_t)(sx ^ sy));
+
+ /*
+ * The result may be smaller, or slightly larger. We normalize
+ * it to the 2^63..2^64-1 range (if xu is zero, then it stays
+ * at zero).
+ */
+ FPR_NORM64(xu, ex);
+
+ /*
+     * Scale down the value to 2^54..2^55-1, handling the last bit
+ * as sticky.
+ */
+ xu |= ((uint32_t)xu & 0x1FF) + 0x1FF;
+ xu >>= 9;
+ ex += 9;
+
+ /*
+ * In general, the result has the sign of x. However, if the
+ * result is exactly zero, then the following situations may
+ * be encountered:
+ * x > 0, y = -x -> result should be +0
+ * x < 0, y = -x -> result should be +0
+ * x = +0, y = +0 -> result should be +0
+ * x = -0, y = +0 -> result should be +0
+ * x = +0, y = -0 -> result should be +0
+ * x = -0, y = -0 -> result should be -0
+ *
+ * But at the conditional swap step at the start of the
+ * function, we ensured that if abs(x) = abs(y) and the
+ * sign of x was 1, then x and y were swapped. Thus, the
+ * two following cases cannot actually happen:
+ * x < 0, y = -x
+ * x = -0, y = +0
+ * In all other cases, the sign bit of x is conserved, which
+ * is what the FPR() function does. The FPR() function also
+ * properly clamps values to zero when the exponent is too
+ * low, but does not alter the sign in that case.
+ */
+ return FPR(sx, ex, xu);
+}
+
+fpr
+fpr_mul(fpr x, fpr y) {
+ uint64_t xu, yu, w, zu, zv;
+ uint32_t x0, x1, y0, y1, z0, z1, z2;
+ int ex, ey, d, e, s;
+
+ /*
+ * Extract absolute values as scaled unsigned integers. We
+ * don't extract exponents yet.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * We have two 53-bit integers to multiply; we need to split
+     * each into a lower half and an upper half. Moreover, we
+ * prefer to have lower halves to be of 25 bits each, for
+ * reasons explained later on.
+ */
+ x0 = (uint32_t)xu & 0x01FFFFFF;
+ x1 = (uint32_t)(xu >> 25);
+ y0 = (uint32_t)yu & 0x01FFFFFF;
+ y1 = (uint32_t)(yu >> 25);
+ w = (uint64_t)x0 * (uint64_t)y0;
+ z0 = (uint32_t)w & 0x01FFFFFF;
+ z1 = (uint32_t)(w >> 25);
+ w = (uint64_t)x0 * (uint64_t)y1;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 = (uint32_t)(w >> 25);
+ w = (uint64_t)x1 * (uint64_t)y0;
+ z1 += (uint32_t)w & 0x01FFFFFF;
+ z2 += (uint32_t)(w >> 25);
+ zu = (uint64_t)x1 * (uint64_t)y1;
+ z2 += (z1 >> 25);
+ z1 &= 0x01FFFFFF;
+ zu += z2;
+
+ /*
+ * Since xu and yu are both in the 2^52..2^53-1 range, the
+ * product is in the 2^104..2^106-1 range. We first reassemble
+ * it and round it into the 2^54..2^56-1 range; the bottom bit
+ * is made "sticky". Since the low limbs z0 and z1 are 25 bits
+ * each, we just take the upper part (zu), and consider z0 and
+ * z1 only for purposes of stickiness.
+ * (This is the reason why we chose 25-bit limbs above.)
+ */
+ zu |= ((z0 | z1) + 0x01FFFFFF) >> 25;
+
+ /*
+     * We normalize zu to the 2^54..2^55-1 range: it could be one
+ * bit too large at this point. This is done with a conditional
+ * right-shift that takes into account the sticky bit.
+ */
+ zv = (zu >> 1) | (zu & 1);
+ w = zu >> 55;
+ zu ^= (zu ^ zv) & -w;
+
+ /*
+ * Get the aggregate scaling factor:
+ *
+ * - Each exponent is biased by 1023.
+ *
+ * - Integral mantissas are scaled by 2^52, hence an
+ * extra 52 bias for each exponent.
+ *
+ * - However, we right-shifted z by 50 bits, and then
+ * by 0 or 1 extra bit (depending on the value of w).
+ *
+ * In total, we must add the exponents, then subtract
+ * 2 * (1023 + 52), then add 50 + w.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex + ey - 2100 + (int)w;
+
+ /*
+ * Sign bit is the XOR of the operand sign bits.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if either of the operands is
+ * zero, then the computations above were wrong. Test for zero
+ * is whether ex or ey is zero. We just have to set the mantissa
+ * (zu) to zero, the FPR() function will normalize e.
+ */
+ d = ((ex + 0x7FF) & (ey + 0x7FF)) >> 11;
+ zu &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, zu);
+}
+
+fpr
+fpr_div(fpr x, fpr y) {
+ uint64_t xu, yu, q, q2, w;
+ int i, ex, ey, e, d, s;
+
+ /*
+ * Extract mantissas of x and y (unsigned).
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ yu = (y & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+
+ /*
+ * Perform bit-by-bit division of xu by yu. We run it for 55 bits.
+ */
+ q = 0;
+ for (i = 0; i < 55; i ++) {
+ /*
+         * If yu is less than or equal to xu, then subtract it and
+ * push a 1 in the quotient; otherwise, leave xu unchanged
+ * and push a 0.
+ */
+ uint64_t b;
+
+ b = ((xu - yu) >> 63) - 1;
+ xu -= b & yu;
+ q |= b & 1;
+ xu <<= 1;
+ q <<= 1;
+ }
+
+ /*
+ * We got 55 bits in the quotient, followed by an extra zero. We
+ * want that 56th bit to be "sticky": it should be a 1 if and
+ * only if the remainder (xu) is non-zero.
+ */
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Quotient is at most 2^56-1. Its top bit may be zero, but in
+ * that case the next-to-top bit will be a one, since the
+ * initial xu and yu were both in the 2^52..2^53-1 range.
+ * We perform a conditional shift to normalize q to the
+ * 2^54..2^55-1 range (with the bottom bit being sticky).
+ */
+ q2 = (q >> 1) | (q & 1);
+ w = q >> 55;
+ q ^= (q ^ q2) & -w;
+
+ /*
+ * Extract exponents to compute the scaling factor:
+ *
+ * - Each exponent is biased and we scaled them up by
+ * 52 bits; but these biases will cancel out.
+ *
+ * - The division loop produced a 55-bit shifted result,
+ * so we must scale it down by 55 bits.
+ *
+ * - If w = 1, we right-shifted the integer by 1 bit,
+ * hence we must add 1 to the scaling.
+ */
+ ex = (int)((x >> 52) & 0x7FF);
+ ey = (int)((y >> 52) & 0x7FF);
+ e = ex - ey - 55 + (int)w;
+
+ /*
+ * Sign is the XOR of the signs of the operands.
+ */
+ s = (int)((x ^ y) >> 63);
+
+ /*
+ * Corrective actions for zeros: if x = 0, then the computation
+ * is wrong, and we must clamp e and q to 0. We do not care
+ * about the case y = 0 (as per assumptions in this module,
+ * the caller does not perform divisions by zero).
+ */
+ d = (ex + 0x7FF) >> 11;
+ s &= d;
+ e &= -d;
+ q &= -(uint64_t)d;
+
+ /*
+ * FPR() packs the result and applies proper rounding.
+ */
+ return FPR(s, e, q);
+}
+
+fpr
+fpr_sqrt(fpr x) {
+ uint64_t xu, q, s, r;
+ int ex, e;
+
+ /*
+ * Extract the mantissa and the exponent. We don't care about
+ * the sign: by assumption, the operand is nonnegative.
+ * We want the "true" exponent corresponding to a mantissa
+ * in the 1..2 range.
+ */
+ xu = (x & (((uint64_t)1 << 52) - 1)) | ((uint64_t)1 << 52);
+ ex = (int)((x >> 52) & 0x7FF);
+ e = ex - 1023;
+
+ /*
+ * If the exponent is odd, double the mantissa and decrement
+ * the exponent. The exponent is then halved to account for
+ * the square root.
+ */
+ xu += xu & -(uint64_t)(e & 1);
+ e >>= 1;
+
+ /*
+ * Double the mantissa.
+ */
+ xu <<= 1;
+
+ /*
+ * We now have a mantissa in the 2^53..2^55-1 range. It
+ * represents a value between 1 (inclusive) and 4 (exclusive)
+ * in fixed point notation (with 53 fractional bits). We
+ * compute the square root bit by bit.
+ */
+ q = 0;
+ s = 0;
+ r = (uint64_t)1 << 53;
+ for (int i = 0; i < 54; i ++) {
+ uint64_t t, b;
+
+ t = s + r;
+ b = ((xu - t) >> 63) - 1;
+ s += (r << 1) & b;
+ xu -= t & b;
+ q += r & b;
+ xu <<= 1;
+ r >>= 1;
+ }
+
+ /*
+ * Now, q is a rounded-low 54-bit value, with a leading 1,
+ * 52 fractional digits, and an additional guard bit. We add
+ * an extra sticky bit to account for what remains of the operand.
+ */
+ q <<= 1;
+ q |= (xu | -xu) >> 63;
+
+ /*
+ * Result q is in the 2^54..2^55-1 range; we bias the exponent
+ * by 54 bits (the value e at that point contains the "true"
+ * exponent, but q is now considered an integer, i.e. scaled
+     * up).
+ */
+ e -= 54;
+
+ /*
+ * Corrective action for an operand of value zero.
+ */
+ q &= -(uint64_t)((ex + 0x7FF) >> 11);
+
+ /*
+     * Apply rounding and pack the result.
+ */
+ return FPR(0, e, q);
+}
+
+uint64_t
+fpr_expm_p63(fpr x, fpr ccs) {
+ /*
+ * Polynomial approximation of exp(-x) is taken from FACCT:
+ * https://eprint.iacr.org/2018/1234
+ * Specifically, values are extracted from the implementation
+ * referenced from the FACCT article, and available at:
+ * https://github.com/raykzhao/gaussian
+ * Here, the coefficients have been scaled up by 2^63 and
+ * converted to integers.
+ *
+     * Tests over more than 24 billion random inputs in the
+ * 0..log(2) range have never shown a deviation larger than
+ * 2^(-50) from the true mathematical value.
+ */
+ static const uint64_t C[] = {
+ 0x00000004741183A3u,
+ 0x00000036548CFC06u,
+ 0x0000024FDCBF140Au,
+ 0x0000171D939DE045u,
+ 0x0000D00CF58F6F84u,
+ 0x000680681CF796E3u,
+ 0x002D82D8305B0FEAu,
+ 0x011111110E066FD0u,
+ 0x0555555555070F00u,
+ 0x155555555581FF00u,
+ 0x400000000002B400u,
+ 0x7FFFFFFFFFFF4800u,
+ 0x8000000000000000u
+ };
+
+ uint64_t z, y;
+ unsigned u;
+ uint32_t z0, z1, y0, y1;
+ uint64_t a, b;
+
+ y = C[0];
+ z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
+ for (u = 1; u < (sizeof C) / sizeof(C[0]); u ++) {
+ /*
+ * Compute product z * y over 128 bits, but keep only
+ * the top 64 bits.
+ *
+ * TODO: On some architectures/compilers we could use
+ * some intrinsics (__umulh() on MSVC) or other compiler
+ * extensions (unsigned __int128 on GCC / Clang) for
+ * improved speed; however, most 64-bit architectures
+ * also have appropriate IEEE754 floating-point support,
+ * which is better.
+ */
+ uint64_t c;
+
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ c = (a >> 32) + (b >> 32);
+ c += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ c += (uint64_t)z1 * (uint64_t)y1;
+ y = C[u] - c;
+ }
+
+ /*
+ * The scaling factor must be applied at the end. Since y is now
+ * in fixed-point notation, we have to convert the factor to the
+ * same format, and do an extra integer multiplication.
+ */
+ z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+ z0 = (uint32_t)z;
+ z1 = (uint32_t)(z >> 32);
+ y0 = (uint32_t)y;
+ y1 = (uint32_t)(y >> 32);
+ a = ((uint64_t)z0 * (uint64_t)y1)
+ + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+ b = ((uint64_t)z1 * (uint64_t)y0);
+ y = (a >> 32) + (b >> 32);
+ y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+ y += (uint64_t)z1 * (uint64_t)y1;
+
+ return y;
+}
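+
+/*
+ * Illustrative note on the loop above (not part of the reference
+ * code): with z ~= x * 2^64 it is a fixed-point Horner evaluation,
+ * each step computing y <- C[u] - floor(z * y / 2^64), so that after
+ * the loop y ~= exp(-x) * 2^63 (the C[] entries are essentially the
+ * Taylor coefficients 1/k! of exp, scaled by 2^63 and slightly
+ * adjusted, with the alternating signs folded into the subtraction).
+ * The final block multiplies by ccs in the same fixed-point format,
+ * so the return value is approximately ccs * exp(-x) * 2^63.
+ */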
+
+const fpr fpr_gm_tab[] = {
+ 0, 0,
+ 9223372036854775808U, 4607182418800017408U,
+ 4604544271217802189U, 4604544271217802189U,
+ 13827916308072577997U, 4604544271217802189U,
+ 4606496786581982534U, 4600565431771507043U,
+ 13823937468626282851U, 4606496786581982534U,
+ 4600565431771507043U, 4606496786581982534U,
+ 13829868823436758342U, 4600565431771507043U,
+ 4607009347991985328U, 4596196889902818827U,
+ 13819568926757594635U, 4607009347991985328U,
+ 4603179351334086856U, 4605664432017547683U,
+ 13829036468872323491U, 4603179351334086856U,
+ 4605664432017547683U, 4603179351334086856U,
+ 13826551388188862664U, 4605664432017547683U,
+ 4596196889902818827U, 4607009347991985328U,
+ 13830381384846761136U, 4596196889902818827U,
+ 4607139046673687846U, 4591727299969791020U,
+ 13815099336824566828U, 4607139046673687846U,
+ 4603889326261607894U, 4605137878724712257U,
+ 13828509915579488065U, 4603889326261607894U,
+ 4606118860100255153U, 4602163548591158843U,
+ 13825535585445934651U, 4606118860100255153U,
+ 4598900923775164166U, 4606794571824115162U,
+ 13830166608678890970U, 4598900923775164166U,
+ 4606794571824115162U, 4598900923775164166U,
+ 13822272960629939974U, 4606794571824115162U,
+ 4602163548591158843U, 4606118860100255153U,
+ 13829490896955030961U, 4602163548591158843U,
+ 4605137878724712257U, 4603889326261607894U,
+ 13827261363116383702U, 4605137878724712257U,
+ 4591727299969791020U, 4607139046673687846U,
+ 13830511083528463654U, 4591727299969791020U,
+ 4607171569234046334U, 4587232218149935124U,
+ 13810604255004710932U, 4607171569234046334U,
+ 4604224084862889120U, 4604849113969373103U,
+ 13828221150824148911U, 4604224084862889120U,
+ 4606317631232591731U, 4601373767755717824U,
+ 13824745804610493632U, 4606317631232591731U,
+ 4599740487990714333U, 4606655894547498725U,
+ 13830027931402274533U, 4599740487990714333U,
+ 4606912484326125783U, 4597922303871901467U,
+ 13821294340726677275U, 4606912484326125783U,
+ 4602805845399633902U, 4605900952042040894U,
+ 13829272988896816702U, 4602805845399633902U,
+ 4605409869824231233U, 4603540801876750389U,
+ 13826912838731526197U, 4605409869824231233U,
+ 4594454542771183930U, 4607084929468638487U,
+ 13830456966323414295U, 4594454542771183930U,
+ 4607084929468638487U, 4594454542771183930U,
+ 13817826579625959738U, 4607084929468638487U,
+ 4603540801876750389U, 4605409869824231233U,
+ 13828781906679007041U, 4603540801876750389U,
+ 4605900952042040894U, 4602805845399633902U,
+ 13826177882254409710U, 4605900952042040894U,
+ 4597922303871901467U, 4606912484326125783U,
+ 13830284521180901591U, 4597922303871901467U,
+ 4606655894547498725U, 4599740487990714333U,
+ 13823112524845490141U, 4606655894547498725U,
+ 4601373767755717824U, 4606317631232591731U,
+ 13829689668087367539U, 4601373767755717824U,
+ 4604849113969373103U, 4604224084862889120U,
+ 13827596121717664928U, 4604849113969373103U,
+ 4587232218149935124U, 4607171569234046334U,
+ 13830543606088822142U, 4587232218149935124U,
+ 4607179706000002317U, 4582730748936808062U,
+ 13806102785791583870U, 4607179706000002317U,
+ 4604386048625945823U, 4604698657331085206U,
+ 13828070694185861014U, 4604386048625945823U,
+ 4606409688975526202U, 4600971798440897930U,
+ 13824343835295673738U, 4606409688975526202U,
+ 4600154912527631775U, 4606578871587619388U,
+ 13829950908442395196U, 4600154912527631775U,
+ 4606963563043808649U, 4597061974398750563U,
+ 13820434011253526371U, 4606963563043808649U,
+ 4602994049708411683U, 4605784983948558848U,
+ 13829157020803334656U, 4602994049708411683U,
+ 4605539368864982914U, 4603361638657888991U,
+ 13826733675512664799U, 4605539368864982914U,
+ 4595327571478659014U, 4607049811591515049U,
+ 13830421848446290857U, 4595327571478659014U,
+ 4607114680469659603U, 4593485039402578702U,
+ 13816857076257354510U, 4607114680469659603U,
+ 4603716733069447353U, 4605276012900672507U,
+ 13828648049755448315U, 4603716733069447353U,
+ 4606012266443150634U, 4602550884377336506U,
+ 13825922921232112314U, 4606012266443150634U,
+ 4598476289818621559U, 4606856142606846307U,
+ 13830228179461622115U, 4598476289818621559U,
+ 4606727809065869586U, 4599322407794599425U,
+ 13822694444649375233U, 4606727809065869586U,
+ 4601771097584682078U, 4606220668805321205U,
+ 13829592705660097013U, 4601771097584682078U,
+ 4604995550503212910U, 4604058477489546729U,
+ 13827430514344322537U, 4604995550503212910U,
+ 4589965306122607094U, 4607158013403433018U,
+ 13830530050258208826U, 4589965306122607094U,
+ 4607158013403433018U, 4589965306122607094U,
+ 13813337342977382902U, 4607158013403433018U,
+ 4604058477489546729U, 4604995550503212910U,
+ 13828367587357988718U, 4604058477489546729U,
+ 4606220668805321205U, 4601771097584682078U,
+ 13825143134439457886U, 4606220668805321205U,
+ 4599322407794599425U, 4606727809065869586U,
+ 13830099845920645394U, 4599322407794599425U,
+ 4606856142606846307U, 4598476289818621559U,
+ 13821848326673397367U, 4606856142606846307U,
+ 4602550884377336506U, 4606012266443150634U,
+ 13829384303297926442U, 4602550884377336506U,
+ 4605276012900672507U, 4603716733069447353U,
+ 13827088769924223161U, 4605276012900672507U,
+ 4593485039402578702U, 4607114680469659603U,
+ 13830486717324435411U, 4593485039402578702U,
+ 4607049811591515049U, 4595327571478659014U,
+ 13818699608333434822U, 4607049811591515049U,
+ 4603361638657888991U, 4605539368864982914U,
+ 13828911405719758722U, 4603361638657888991U,
+ 4605784983948558848U, 4602994049708411683U,
+ 13826366086563187491U, 4605784983948558848U,
+ 4597061974398750563U, 4606963563043808649U,
+ 13830335599898584457U, 4597061974398750563U,
+ 4606578871587619388U, 4600154912527631775U,
+ 13823526949382407583U, 4606578871587619388U,
+ 4600971798440897930U, 4606409688975526202U,
+ 13829781725830302010U, 4600971798440897930U,
+ 4604698657331085206U, 4604386048625945823U,
+ 13827758085480721631U, 4604698657331085206U,
+ 4582730748936808062U, 4607179706000002317U,
+ 13830551742854778125U, 4582730748936808062U,
+ 4607181740574479067U, 4578227681973159812U,
+ 13801599718827935620U, 4607181740574479067U,
+ 4604465633578481725U, 4604621949701367983U,
+ 13827993986556143791U, 4604465633578481725U,
+ 4606453861145241227U, 4600769149537129431U,
+ 13824141186391905239U, 4606453861145241227U,
+ 4600360675823176935U, 4606538458821337243U,
+ 13829910495676113051U, 4600360675823176935U,
+ 4606987119037722413U, 4596629994023683153U,
+ 13820002030878458961U, 4606987119037722413U,
+ 4603087070374583113U, 4605725276488455441U,
+ 13829097313343231249U, 4603087070374583113U,
+ 4605602459698789090U, 4603270878689749849U,
+ 13826642915544525657U, 4605602459698789090U,
+ 4595762727260045105U, 4607030246558998647U,
+ 13830402283413774455U, 4595762727260045105U,
+ 4607127537664763515U, 4592606767730311893U,
+ 13815978804585087701U, 4607127537664763515U,
+ 4603803453461190356U, 4605207475328619533U,
+ 13828579512183395341U, 4603803453461190356U,
+ 4606066157444814153U, 4602357870542944470U,
+ 13825729907397720278U, 4606066157444814153U,
+ 4598688984595225406U, 4606826008603986804U,
+ 13830198045458762612U, 4598688984595225406U,
+ 4606761837001494797U, 4599112075441176914U,
+ 13822484112295952722U, 4606761837001494797U,
+ 4601967947786150793U, 4606170366472647579U,
+ 13829542403327423387U, 4601967947786150793U,
+ 4605067233569943231U, 4603974338538572089U,
+ 13827346375393347897U, 4605067233569943231U,
+ 4590846768565625881U, 4607149205763218185U,
+ 13830521242617993993U, 4590846768565625881U,
+ 4607165468267934125U, 4588998070480937184U,
+ 13812370107335712992U, 4607165468267934125U,
+ 4604141730443515286U, 4604922840319727473U,
+ 13828294877174503281U, 4604141730443515286U,
+ 4606269759522929756U, 4601573027631668967U,
+ 13824945064486444775U, 4606269759522929756U,
+ 4599531889160152938U, 4606692493141721470U,
+ 13830064529996497278U, 4599531889160152938U,
+ 4606884969294623682U, 4598262871476403630U,
+ 13821634908331179438U, 4606884969294623682U,
+ 4602710690099904183U, 4605957195211051218U,
+ 13829329232065827026U, 4602710690099904183U,
+ 4605343481119364930U, 4603629178146150899U,
+ 13827001215000926707U, 4605343481119364930U,
+ 4594016801320007031U, 4607100477024622401U,
+ 13830472513879398209U, 4594016801320007031U,
+ 4607068040143112603U, 4594891488091520602U,
+ 13818263524946296410U, 4607068040143112603U,
+ 4603451617570386922U, 4605475169017376660U,
+ 13828847205872152468U, 4603451617570386922U,
+ 4605843545406134034U, 4602900303344142735U,
+ 13826272340198918543U, 4605843545406134034U,
+ 4597492765973365521U, 4606938683557690074U,
+ 13830310720412465882U, 4597492765973365521U,
+ 4606618018794815019U, 4599948172872067014U,
+ 13823320209726842822U, 4606618018794815019U,
+ 4601173347964633034U, 4606364276725003740U,
+ 13829736313579779548U, 4601173347964633034U,
+ 4604774382555066977U, 4604305528345395596U,
+ 13827677565200171404U, 4604774382555066977U,
+ 4585465300892538317U, 4607176315382986589U,
+ 13830548352237762397U, 4585465300892538317U,
+ 4607176315382986589U, 4585465300892538317U,
+ 13808837337747314125U, 4607176315382986589U,
+ 4604305528345395596U, 4604774382555066977U,
+ 13828146419409842785U, 4604305528345395596U,
+ 4606364276725003740U, 4601173347964633034U,
+ 13824545384819408842U, 4606364276725003740U,
+ 4599948172872067014U, 4606618018794815019U,
+ 13829990055649590827U, 4599948172872067014U,
+ 4606938683557690074U, 4597492765973365521U,
+ 13820864802828141329U, 4606938683557690074U,
+ 4602900303344142735U, 4605843545406134034U,
+ 13829215582260909842U, 4602900303344142735U,
+ 4605475169017376660U, 4603451617570386922U,
+ 13826823654425162730U, 4605475169017376660U,
+ 4594891488091520602U, 4607068040143112603U,
+ 13830440076997888411U, 4594891488091520602U,
+ 4607100477024622401U, 4594016801320007031U,
+ 13817388838174782839U, 4607100477024622401U,
+ 4603629178146150899U, 4605343481119364930U,
+ 13828715517974140738U, 4603629178146150899U,
+ 4605957195211051218U, 4602710690099904183U,
+ 13826082726954679991U, 4605957195211051218U,
+ 4598262871476403630U, 4606884969294623682U,
+ 13830257006149399490U, 4598262871476403630U,
+ 4606692493141721470U, 4599531889160152938U,
+ 13822903926014928746U, 4606692493141721470U,
+ 4601573027631668967U, 4606269759522929756U,
+ 13829641796377705564U, 4601573027631668967U,
+ 4604922840319727473U, 4604141730443515286U,
+ 13827513767298291094U, 4604922840319727473U,
+ 4588998070480937184U, 4607165468267934125U,
+ 13830537505122709933U, 4588998070480937184U,
+ 4607149205763218185U, 4590846768565625881U,
+ 13814218805420401689U, 4607149205763218185U,
+ 4603974338538572089U, 4605067233569943231U,
+ 13828439270424719039U, 4603974338538572089U,
+ 4606170366472647579U, 4601967947786150793U,
+ 13825339984640926601U, 4606170366472647579U,
+ 4599112075441176914U, 4606761837001494797U,
+ 13830133873856270605U, 4599112075441176914U,
+ 4606826008603986804U, 4598688984595225406U,
+ 13822061021450001214U, 4606826008603986804U,
+ 4602357870542944470U, 4606066157444814153U,
+ 13829438194299589961U, 4602357870542944470U,
+ 4605207475328619533U, 4603803453461190356U,
+ 13827175490315966164U, 4605207475328619533U,
+ 4592606767730311893U, 4607127537664763515U,
+ 13830499574519539323U, 4592606767730311893U,
+ 4607030246558998647U, 4595762727260045105U,
+ 13819134764114820913U, 4607030246558998647U,
+ 4603270878689749849U, 4605602459698789090U,
+ 13828974496553564898U, 4603270878689749849U,
+ 4605725276488455441U, 4603087070374583113U,
+ 13826459107229358921U, 4605725276488455441U,
+ 4596629994023683153U, 4606987119037722413U,
+ 13830359155892498221U, 4596629994023683153U,
+ 4606538458821337243U, 4600360675823176935U,
+ 13823732712677952743U, 4606538458821337243U,
+ 4600769149537129431U, 4606453861145241227U,
+ 13829825898000017035U, 4600769149537129431U,
+ 4604621949701367983U, 4604465633578481725U,
+ 13827837670433257533U, 4604621949701367983U,
+ 4578227681973159812U, 4607181740574479067U,
+ 13830553777429254875U, 4578227681973159812U,
+ 4607182249242036882U, 4573724215515480177U,
+ 13797096252370255985U, 4607182249242036882U,
+ 4604505071555817232U, 4604583231088591477U,
+ 13827955267943367285U, 4604505071555817232U,
+ 4606475480113671417U, 4600667422348321968U,
+ 13824039459203097776U, 4606475480113671417U,
+ 4600463181646572228U, 4606517779747998088U,
+ 13829889816602773896U, 4600463181646572228U,
+ 4606998399608725124U, 4596413578358834022U,
+ 13819785615213609830U, 4606998399608725124U,
+ 4603133304188877240U, 4605694995810664660U,
+ 13829067032665440468U, 4603133304188877240U,
+ 4605633586259814045U, 4603225210076562971U,
+ 13826597246931338779U, 4605633586259814045U,
+ 4595979936813835462U, 4607019963775302583U,
+ 13830392000630078391U, 4595979936813835462U,
+ 4607133460805585796U, 4592167175087283203U,
+ 13815539211942059011U, 4607133460805585796U,
+ 4603846496621587377U, 4605172808754305228U,
+ 13828544845609081036U, 4603846496621587377U,
+ 4606092657816072624U, 4602260871257280788U,
+ 13825632908112056596U, 4606092657816072624U,
+ 4598795050632330097U, 4606810452769876110U,
+ 13830182489624651918U, 4598795050632330097U,
+ 4606778366364612594U, 4599006600037663623U,
+ 13822378636892439431U, 4606778366364612594U,
+ 4602065906208722008U, 4606144763310860551U,
+ 13829516800165636359U, 4602065906208722008U,
+ 4605102686554936490U, 4603931940768740167U,
+ 13827303977623515975U, 4605102686554936490U,
+ 4591287158938884897U, 4607144295058764886U,
+ 13830516331913540694U, 4591287158938884897U,
+ 4607168688050493276U, 4588115294056142819U,
+ 13811487330910918627U, 4607168688050493276U,
+ 4604183020748362039U, 4604886103475043762U,
+ 13828258140329819570U, 4604183020748362039U,
+ 4606293848208650998U, 4601473544562720001U,
+ 13824845581417495809U, 4606293848208650998U,
+ 4599636300858866724U, 4606674353838411301U,
+ 13830046390693187109U, 4599636300858866724U,
+ 4606898891031025132U, 4598136582470364665U,
+ 13821508619325140473U, 4606898891031025132U,
+ 4602758354025980442U, 4605929219593405673U,
+ 13829301256448181481U, 4602758354025980442U,
+ 4605376811039722786U, 4603585091850767959U,
+ 13826957128705543767U, 4605376811039722786U,
+ 4594235767444503503U, 4607092871118901179U,
+ 13830464907973676987U, 4594235767444503503U,
+ 4607076652372832968U, 4594673119063280916U,
+ 13818045155918056724U, 4607076652372832968U,
+ 4603496309891590679U, 4605442656228245717U,
+ 13828814693083021525U, 4603496309891590679U,
+ 4605872393621214213U, 4602853162432841185U,
+ 13826225199287616993U, 4605872393621214213U,
+ 4597707695679609371U, 4606925748668145757U,
+ 13830297785522921565U, 4597707695679609371U,
+ 4606637115963965612U, 4599844446633109139U,
+ 13823216483487884947U, 4606637115963965612U,
+ 4601273700967202825U, 4606341107699334546U,
+ 13829713144554110354U, 4601273700967202825U,
+ 4604811873195349477U, 4604264921241055824U,
+ 13827636958095831632U, 4604811873195349477U,
+ 4586348876009622851U, 4607174111710118367U,
+ 13830546148564894175U, 4586348876009622851U,
+ 4607178180169683960U, 4584498631466405633U,
+ 13807870668321181441U, 4607178180169683960U,
+ 4604345904647073908U, 4604736643460027021U,
+ 13828108680314802829U, 4604345904647073908U,
+ 4606387137437298591U, 4601072712526242277U,
+ 13824444749381018085U, 4606387137437298591U,
+ 4600051662802353687U, 4606598603759044570U,
+ 13829970640613820378U, 4600051662802353687U,
+ 4606951288507767453U, 4597277522845151878U,
+ 13820649559699927686U, 4606951288507767453U,
+ 4602947266358709886U, 4605814408482919348U,
+ 13829186445337695156U, 4602947266358709886U,
+ 4605507406967535927U, 4603406726595779752U,
+ 13826778763450555560U, 4605507406967535927U,
+ 4595109641634432498U, 4607059093103722971U,
+ 13830431129958498779U, 4595109641634432498U,
+ 4607107746899444102U, 4593797652641645341U,
+ 13817169689496421149U, 4607107746899444102U,
+ 4603673059103075106U, 4605309881318010327U,
+ 13828681918172786135U, 4603673059103075106U,
+ 4605984877841711338U, 4602646891659203088U,
+ 13826018928513978896U, 4605984877841711338U,
+ 4598369669086960528U, 4606870719641066940U,
+ 13830242756495842748U, 4598369669086960528U,
+ 4606710311774494716U, 4599427256825614420U,
+ 13822799293680390228U, 4606710311774494716U,
+ 4601672213217083403U, 4606245366082353408U,
+ 13829617402937129216U, 4601672213217083403U,
+ 4604959323120302796U, 4604100215502905499U,
+ 13827472252357681307U, 4604959323120302796U,
+ 4589524267239410099U, 4607161910007591876U,
+ 13830533946862367684U, 4589524267239410099U,
+ 4607153778602162496U, 4590406145430462614U,
+ 13813778182285238422U, 4607153778602162496U,
+ 4604016517974851588U, 4605031521104517324U,
+ 13828403557959293132U, 4604016517974851588U,
+ 4606195668621671667U, 4601869677011524443U,
+ 13825241713866300251U, 4606195668621671667U,
+ 4599217346014614711U, 4606744984357082948U,
+ 13830117021211858756U, 4599217346014614711U,
+ 4606841238740778884U, 4598582729657176439U,
+ 13821954766511952247U, 4606841238740778884U,
+ 4602454542796181607U, 4606039359984203741U,
+ 13829411396838979549U, 4602454542796181607U,
+ 4605241877142478242U, 4603760198400967492U,
+ 13827132235255743300U, 4605241877142478242U,
+ 4593046061348462537U, 4607121277474223905U,
+ 13830493314328999713U, 4593046061348462537U,
+ 4607040195955932526U, 4595545269419264690U,
+ 13818917306274040498U, 4607040195955932526U,
+ 4603316355454250015U, 4605571053506370248U,
+ 13828943090361146056U, 4603316355454250015U,
+ 4605755272910869620U, 4603040651631881451U,
+ 13826412688486657259U, 4605755272910869620U,
+ 4596846128749438754U, 4606975506703684317U,
+ 13830347543558460125U, 4596846128749438754U,
+ 4606558823023444576U, 4600257918160607478U,
+ 13823629955015383286U, 4606558823023444576U,
+ 4600870609507958271U, 4606431930490633905U,
+ 13829803967345409713U, 4600870609507958271U,
+ 4604660425598397818U, 4604425958770613225U,
+ 13827797995625389033U, 4604660425598397818U,
+ 4580962600092897021U, 4607180892816495009U,
+ 13830552929671270817U, 4580962600092897021U,
+ 4607180892816495009U, 4580962600092897021U,
+ 13804334636947672829U, 4607180892816495009U,
+ 4604425958770613225U, 4604660425598397818U,
+ 13828032462453173626U, 4604425958770613225U,
+ 4606431930490633905U, 4600870609507958271U,
+ 13824242646362734079U, 4606431930490633905U,
+ 4600257918160607478U, 4606558823023444576U,
+ 13829930859878220384U, 4600257918160607478U,
+ 4606975506703684317U, 4596846128749438754U,
+ 13820218165604214562U, 4606975506703684317U,
+ 4603040651631881451U, 4605755272910869620U,
+ 13829127309765645428U, 4603040651631881451U,
+ 4605571053506370248U, 4603316355454250015U,
+ 13826688392309025823U, 4605571053506370248U,
+ 4595545269419264690U, 4607040195955932526U,
+ 13830412232810708334U, 4595545269419264690U,
+ 4607121277474223905U, 4593046061348462537U,
+ 13816418098203238345U, 4607121277474223905U,
+ 4603760198400967492U, 4605241877142478242U,
+ 13828613913997254050U, 4603760198400967492U,
+ 4606039359984203741U, 4602454542796181607U,
+ 13825826579650957415U, 4606039359984203741U,
+ 4598582729657176439U, 4606841238740778884U,
+ 13830213275595554692U, 4598582729657176439U,
+ 4606744984357082948U, 4599217346014614711U,
+ 13822589382869390519U, 4606744984357082948U,
+ 4601869677011524443U, 4606195668621671667U,
+ 13829567705476447475U, 4601869677011524443U,
+ 4605031521104517324U, 4604016517974851588U,
+ 13827388554829627396U, 4605031521104517324U,
+ 4590406145430462614U, 4607153778602162496U,
+ 13830525815456938304U, 4590406145430462614U,
+ 4607161910007591876U, 4589524267239410099U,
+ 13812896304094185907U, 4607161910007591876U,
+ 4604100215502905499U, 4604959323120302796U,
+ 13828331359975078604U, 4604100215502905499U,
+ 4606245366082353408U, 4601672213217083403U,
+ 13825044250071859211U, 4606245366082353408U,
+ 4599427256825614420U, 4606710311774494716U,
+ 13830082348629270524U, 4599427256825614420U,
+ 4606870719641066940U, 4598369669086960528U,
+ 13821741705941736336U, 4606870719641066940U,
+ 4602646891659203088U, 4605984877841711338U,
+ 13829356914696487146U, 4602646891659203088U,
+ 4605309881318010327U, 4603673059103075106U,
+ 13827045095957850914U, 4605309881318010327U,
+ 4593797652641645341U, 4607107746899444102U,
+ 13830479783754219910U, 4593797652641645341U,
+ 4607059093103722971U, 4595109641634432498U,
+ 13818481678489208306U, 4607059093103722971U,
+ 4603406726595779752U, 4605507406967535927U,
+ 13828879443822311735U, 4603406726595779752U,
+ 4605814408482919348U, 4602947266358709886U,
+ 13826319303213485694U, 4605814408482919348U,
+ 4597277522845151878U, 4606951288507767453U,
+ 13830323325362543261U, 4597277522845151878U,
+ 4606598603759044570U, 4600051662802353687U,
+ 13823423699657129495U, 4606598603759044570U,
+ 4601072712526242277U, 4606387137437298591U,
+ 13829759174292074399U, 4601072712526242277U,
+ 4604736643460027021U, 4604345904647073908U,
+ 13827717941501849716U, 4604736643460027021U,
+ 4584498631466405633U, 4607178180169683960U,
+ 13830550217024459768U, 4584498631466405633U,
+ 4607174111710118367U, 4586348876009622851U,
+ 13809720912864398659U, 4607174111710118367U,
+ 4604264921241055824U, 4604811873195349477U,
+ 13828183910050125285U, 4604264921241055824U,
+ 4606341107699334546U, 4601273700967202825U,
+ 13824645737821978633U, 4606341107699334546U,
+ 4599844446633109139U, 4606637115963965612U,
+ 13830009152818741420U, 4599844446633109139U,
+ 4606925748668145757U, 4597707695679609371U,
+ 13821079732534385179U, 4606925748668145757U,
+ 4602853162432841185U, 4605872393621214213U,
+ 13829244430475990021U, 4602853162432841185U,
+ 4605442656228245717U, 4603496309891590679U,
+ 13826868346746366487U, 4605442656228245717U,
+ 4594673119063280916U, 4607076652372832968U,
+ 13830448689227608776U, 4594673119063280916U,
+ 4607092871118901179U, 4594235767444503503U,
+ 13817607804299279311U, 4607092871118901179U,
+ 4603585091850767959U, 4605376811039722786U,
+ 13828748847894498594U, 4603585091850767959U,
+ 4605929219593405673U, 4602758354025980442U,
+ 13826130390880756250U, 4605929219593405673U,
+ 4598136582470364665U, 4606898891031025132U,
+ 13830270927885800940U, 4598136582470364665U,
+ 4606674353838411301U, 4599636300858866724U,
+ 13823008337713642532U, 4606674353838411301U,
+ 4601473544562720001U, 4606293848208650998U,
+ 13829665885063426806U, 4601473544562720001U,
+ 4604886103475043762U, 4604183020748362039U,
+ 13827555057603137847U, 4604886103475043762U,
+ 4588115294056142819U, 4607168688050493276U,
+ 13830540724905269084U, 4588115294056142819U,
+ 4607144295058764886U, 4591287158938884897U,
+ 13814659195793660705U, 4607144295058764886U,
+ 4603931940768740167U, 4605102686554936490U,
+ 13828474723409712298U, 4603931940768740167U,
+ 4606144763310860551U, 4602065906208722008U,
+ 13825437943063497816U, 4606144763310860551U,
+ 4599006600037663623U, 4606778366364612594U,
+ 13830150403219388402U, 4599006600037663623U,
+ 4606810452769876110U, 4598795050632330097U,
+ 13822167087487105905U, 4606810452769876110U,
+ 4602260871257280788U, 4606092657816072624U,
+ 13829464694670848432U, 4602260871257280788U,
+ 4605172808754305228U, 4603846496621587377U,
+ 13827218533476363185U, 4605172808754305228U,
+ 4592167175087283203U, 4607133460805585796U,
+ 13830505497660361604U, 4592167175087283203U,
+ 4607019963775302583U, 4595979936813835462U,
+ 13819351973668611270U, 4607019963775302583U,
+ 4603225210076562971U, 4605633586259814045U,
+ 13829005623114589853U, 4603225210076562971U,
+ 4605694995810664660U, 4603133304188877240U,
+ 13826505341043653048U, 4605694995810664660U,
+ 4596413578358834022U, 4606998399608725124U,
+ 13830370436463500932U, 4596413578358834022U,
+ 4606517779747998088U, 4600463181646572228U,
+ 13823835218501348036U, 4606517779747998088U,
+ 4600667422348321968U, 4606475480113671417U,
+ 13829847516968447225U, 4600667422348321968U,
+ 4604583231088591477U, 4604505071555817232U,
+ 13827877108410593040U, 4604583231088591477U,
+ 4573724215515480177U, 4607182249242036882U,
+ 13830554286096812690U, 4573724215515480177U,
+ 4607182376410422530U, 4569220649180767418U,
+ 13792592686035543226U, 4607182376410422530U,
+ 4604524701268679793U, 4604563781218984604U,
+ 13827935818073760412U, 4604524701268679793U,
+ 4606486172460753999U, 4600616459743653188U,
+ 13823988496598428996U, 4606486172460753999U,
+ 4600514338912178239U, 4606507322377452870U,
+ 13829879359232228678U, 4600514338912178239U,
+ 4607003915349878877U, 4596305267720071930U,
+ 13819677304574847738U, 4607003915349878877U,
+ 4603156351203636159U, 4605679749231851918U,
+ 13829051786086627726U, 4603156351203636159U,
+ 4605649044311923410U, 4603202304363743346U,
+ 13826574341218519154U, 4605649044311923410U,
+ 4596088445927168004U, 4607014697483910382U,
+ 13830386734338686190U, 4596088445927168004U,
+ 4607136295912168606U, 4591947271803021404U,
+ 13815319308657797212U, 4607136295912168606U,
+ 4603867938232615808U, 4605155376589456981U,
+ 13828527413444232789U, 4603867938232615808U,
+ 4606105796280968177U, 4602212250118051877U,
+ 13825584286972827685U, 4606105796280968177U,
+ 4598848011564831930U, 4606802552898869248U,
+ 13830174589753645056U, 4598848011564831930U,
+ 4606786509620734768U, 4598953786765296928U,
+ 13822325823620072736U, 4606786509620734768U,
+ 4602114767134999006U, 4606131849150971908U,
+ 13829503886005747716U, 4602114767134999006U,
+ 4605120315324767624U, 4603910660507251362U,
+ 13827282697362027170U, 4605120315324767624U,
+ 4591507261658050721U, 4607141713064252300U,
+ 13830513749919028108U, 4591507261658050721U,
+ 4607170170974224083U, 4587673791460508439U,
+ 13811045828315284247U, 4607170170974224083U,
+ 4604203581176243359U, 4604867640218014515U,
+ 13828239677072790323U, 4604203581176243359U,
+ 4606305777984577632U, 4601423692641949331U,
+ 13824795729496725139U, 4606305777984577632U,
+ 4599688422741010356U, 4606665164148251002U,
+ 13830037201003026810U, 4599688422741010356U,
+ 4606905728766014348U, 4598029484874872834U,
+ 13821401521729648642U, 4606905728766014348U,
+ 4602782121393764535U, 4605915122243179241U,
+ 13829287159097955049U, 4602782121393764535U,
+ 4605393374401988274U, 4603562972219549215U,
+ 13826935009074325023U, 4605393374401988274U,
+ 4594345179472540681U, 4607088942243446236U,
+ 13830460979098222044U, 4594345179472540681U,
+ 4607080832832247697U, 4594563856311064231U,
+ 13817935893165840039U, 4607080832832247697U,
+ 4603518581031047189U, 4605426297151190466U,
+ 13828798334005966274U, 4603518581031047189U,
+ 4605886709123365959U, 4602829525820289164U,
+ 13826201562675064972U, 4605886709123365959U,
+ 4597815040470278984U, 4606919157647773535U,
+ 13830291194502549343U, 4597815040470278984U,
+ 4606646545123403481U, 4599792496117920694U,
+ 13823164532972696502U, 4606646545123403481U,
+ 4601323770373937522U, 4606329407841126011U,
+ 13829701444695901819U, 4601323770373937522U,
+ 4604830524903495634U, 4604244531615310815U,
+ 13827616568470086623U, 4604830524903495634U,
+ 4586790578280679046U, 4607172882816799076U,
+ 13830544919671574884U, 4586790578280679046U,
+ 4607178985458280057U, 4583614727651146525U,
+ 13806986764505922333U, 4607178985458280057U,
+ 4604366005771528720U, 4604717681185626434U,
+ 13828089718040402242U, 4604366005771528720U,
+ 4606398451906509788U, 4601022290077223616U,
+ 13824394326931999424U, 4606398451906509788U,
+ 4600103317933788342U, 4606588777269136769U,
+ 13829960814123912577U, 4600103317933788342U,
+ 4606957467106717424U, 4597169786279785693U,
+ 13820541823134561501U, 4606957467106717424U,
+ 4602970680601913687U, 4605799732098147061U,
+ 13829171768952922869U, 4602970680601913687U,
+ 4605523422498301790U, 4603384207141321914U,
+ 13826756243996097722U, 4605523422498301790U,
+ 4595218635031890910U, 4607054494135176056U,
+ 13830426530989951864U, 4595218635031890910U,
+ 4607111255739239816U, 4593688012422887515U,
+ 13817060049277663323U, 4607111255739239816U,
+ 4603694922063032361U, 4605292980606880364U,
+ 13828665017461656172U, 4603694922063032361U,
+ 4605998608960791335U, 4602598930031891166U,
+ 13825970966886666974U, 4605998608960791335U,
+ 4598423001813699022U, 4606863472012527185U,
+ 13830235508867302993U, 4598423001813699022U,
+ 4606719100629313491U, 4599374859150636784U,
+ 13822746896005412592U, 4606719100629313491U,
+ 4601721693286060937U, 4606233055365547081U,
+ 13829605092220322889U, 4601721693286060937U,
+ 4604977468824438271U, 4604079374282302598U,
+ 13827451411137078406U, 4604977468824438271U,
+ 4589744810590291021U, 4607160003989618959U,
+ 13830532040844394767U, 4589744810590291021U,
+ 4607155938267770208U, 4590185751760970393U,
+ 13813557788615746201U, 4607155938267770208U,
+ 4604037525321326463U, 4605013567986435066U,
+ 13828385604841210874U, 4604037525321326463U,
+ 4606208206518262803U, 4601820425647934753U,
+ 13825192462502710561U, 4606208206518262803U,
+ 4599269903251194481U, 4606736437002195879U,
+ 13830108473856971687U, 4599269903251194481U,
+ 4606848731493011465U, 4598529532600161144U,
+ 13821901569454936952U, 4606848731493011465U,
+ 4602502755147763107U, 4606025850160239809U,
+ 13829397887015015617U, 4602502755147763107U,
+ 4605258978359093269U, 4603738491917026584U,
+ 13827110528771802392U, 4605258978359093269U,
+ 4593265590854265407U, 4607118021058468598U,
+ 13830490057913244406U, 4593265590854265407U,
+ 4607045045516813836U, 4595436449949385485U,
+ 13818808486804161293U, 4607045045516813836U,
+ 4603339021357904144U, 4605555245917486022U,
+ 13828927282772261830U, 4603339021357904144U,
+ 4605770164172969910U, 4603017373458244943U,
+ 13826389410313020751U, 4605770164172969910U,
+ 4596954088216812973U, 4606969576261663845U,
+ 13830341613116439653U, 4596954088216812973U,
+ 4606568886807728474U, 4600206446098256018U,
+ 13823578482953031826U, 4606568886807728474U,
+ 4600921238092511730U, 4606420848538580260U,
+ 13829792885393356068U, 4600921238092511730U,
+ 4604679572075463103U, 4604406033021674239U,
+ 13827778069876450047U, 4604679572075463103U,
+ 4581846703643734566U, 4607180341788068727U,
+ 13830552378642844535U, 4581846703643734566U,
+ 4607181359080094673U, 4579996072175835083U,
+ 13803368109030610891U, 4607181359080094673U,
+ 4604445825685214043U, 4604641218080103285U,
+ 13828013254934879093U, 4604445825685214043U,
+ 4606442934727379583U, 4600819913163773071U,
+ 13824191950018548879U, 4606442934727379583U,
+ 4600309328230211502U, 4606548680329491866U,
+ 13829920717184267674U, 4600309328230211502U,
+ 4606981354314050484U, 4596738097012783531U,
+ 13820110133867559339U, 4606981354314050484U,
+ 4603063884010218172U, 4605740310302420207U,
+ 13829112347157196015U, 4603063884010218172U,
+ 4605586791482848547U, 4603293641160266722U,
+ 13826665678015042530U, 4605586791482848547U,
+ 4595654028864046335U, 4607035262954517034U,
+ 13830407299809292842U, 4595654028864046335U,
+ 4607124449686274900U, 4592826452951465409U,
+ 13816198489806241217U, 4607124449686274900U,
+ 4603781852316960384U, 4605224709411790590U,
+ 13828596746266566398U, 4603781852316960384U,
+ 4606052795787882823U, 4602406247776385022U,
+ 13825778284631160830U, 4606052795787882823U,
+ 4598635880488956483U, 4606833664420673202U,
+ 13830205701275449010U, 4598635880488956483U,
+ 4606753451050079834U, 4599164736579548843U,
+ 13822536773434324651U, 4606753451050079834U,
+ 4601918851211878557U, 4606183055233559255U,
+ 13829555092088335063U, 4601918851211878557U,
+ 4605049409688478101U, 4603995455647851249U,
+ 13827367492502627057U, 4605049409688478101U,
+ 4590626485056654602U, 4607151534426937478U,
+ 13830523571281713286U, 4590626485056654602U,
+ 4607163731439411601U, 4589303678145802340U,
+ 13812675715000578148U, 4607163731439411601U,
+ 4604121000955189926U, 4604941113561600762U,
+ 13828313150416376570U, 4604121000955189926U,
+ 4606257600839867033U, 4601622657843474729U,
+ 13824994694698250537U, 4606257600839867033U,
+ 4599479600326345459U, 4606701442584137310U,
+ 13830073479438913118U, 4599479600326345459U,
+ 4606877885424248132U, 4598316292140394014U,
+ 13821688328995169822U, 4606877885424248132U,
+ 4602686793990243041U, 4605971073215153165U,
+ 13829343110069928973U, 4602686793990243041U,
+ 4605326714874986465U, 4603651144395358093U,
+ 13827023181250133901U, 4605326714874986465U,
+ 4593907249284540294U, 4607104153983298999U,
+ 13830476190838074807U, 4593907249284540294U,
+ 4607063608453868552U, 4595000592312171144U,
+ 13818372629166946952U, 4607063608453868552U,
+ 4603429196809300824U, 4605491322423429598U,
+ 13828863359278205406U, 4603429196809300824U,
+ 4605829012964735987U, 4602923807199184054U,
+ 13826295844053959862U, 4605829012964735987U,
+ 4597385183080791534U, 4606945027305114062U,
+ 13830317064159889870U, 4597385183080791534U,
+ 4606608350964852124U, 4599999947619525579U,
+ 13823371984474301387U, 4606608350964852124U,
+ 4601123065313358619U, 4606375745674388705U,
+ 13829747782529164513U, 4601123065313358619U,
+ 4604755543975806820U, 4604325745441780828U,
+ 13827697782296556636U, 4604755543975806820U,
+ 4585023436363055487U, 4607177290141793710U,
+ 13830549326996569518U, 4585023436363055487U,
+ 4607175255902437396U, 4585907115494236537U,
+ 13809279152349012345U, 4607175255902437396U,
+ 4604285253548209224U, 4604793159020491611U,
+ 13828165195875267419U, 4604285253548209224U,
+ 4606352730697093817U, 4601223560006786057U,
+ 13824595596861561865U, 4606352730697093817U,
+ 4599896339047301634U, 4606627607157935956U,
+ 13829999644012711764U, 4599896339047301634U,
+ 4606932257325205256U, 4597600270510262682U,
+ 13820972307365038490U, 4606932257325205256U,
+ 4602876755014813164U, 4605858005670328613U,
+ 13829230042525104421U, 4602876755014813164U,
+ 4605458946901419122U, 4603473988668005304U,
+ 13826846025522781112U, 4605458946901419122U,
+ 4594782329999411347U, 4607072388129742377U,
+ 13830444424984518185U, 4594782329999411347U,
+ 4607096716058023245U, 4594126307716900071U,
+ 13817498344571675879U, 4607096716058023245U,
+ 4603607160562208225U, 4605360179893335444U,
+ 13828732216748111252U, 4603607160562208225U,
+ 4605943243960030558U, 4602734543519989142U,
+ 13826106580374764950U, 4605943243960030558U,
+ 4598209407597805010U, 4606891971185517504U,
+ 13830264008040293312U, 4598209407597805010U,
+ 4606683463531482757U, 4599584122834874440U,
+ 13822956159689650248U, 4606683463531482757U,
+ 4601523323048804569U, 4606281842017099424U,
+ 13829653878871875232U, 4601523323048804569U,
+ 4604904503566677638U, 4604162403772767740U,
+ 13827534440627543548U, 4604904503566677638U,
+ 4588556721781247689U, 4607167120476811757U,
+ 13830539157331587565U, 4588556721781247689U,
+ 4607146792632922887U, 4591066993883984169U,
+ 13814439030738759977U, 4607146792632922887U,
+ 4603953166845776383U, 4605084992581147553U,
+ 13828457029435923361U, 4603953166845776383U,
+ 4606157602458368090U, 4602016966272225497U,
+ 13825389003127001305U, 4606157602458368090U,
+ 4599059363095165615U, 4606770142132396069U,
+ 13830142178987171877U, 4599059363095165615U,
+ 4606818271362779153U, 4598742041476147134U,
+ 13822114078330922942U, 4606818271362779153U,
+ 4602309411551204896U, 4606079444829232727U,
+ 13829451481684008535U, 4602309411551204896U,
+ 4605190175055178825U, 4603825001630339212U,
+ 13827197038485115020U, 4605190175055178825U,
+ 4592387007752762956U, 4607130541380624519U,
+ 13830502578235400327U, 4592387007752762956U,
+ 4607025146816593591U, 4595871363584150300U,
+ 13819243400438926108U, 4607025146816593591U,
+ 4603248068256948438U, 4605618058006716661U,
+ 13828990094861492469U, 4603248068256948438U,
+ 4605710171610479304U, 4603110210506737381U,
+ 13826482247361513189U, 4605710171610479304U,
+ 4596521820799644122U, 4606992800820440327U,
+ 13830364837675216135U, 4596521820799644122U,
+ 4606528158595189433U, 4600411960456200676U,
+ 13823783997310976484U, 4606528158595189433U,
+ 4600718319105833937U, 4606464709641375231U,
+ 13829836746496151039U, 4600718319105833937U,
+ 4604602620643553229U, 4604485382263976838U,
+ 13827857419118752646U, 4604602620643553229U,
+ 4576459225186735875U, 4607182037296057423U,
+ 13830554074150833231U, 4576459225186735875U,
+ 4607182037296057423U, 4576459225186735875U,
+ 13799831262041511683U, 4607182037296057423U,
+ 4604485382263976838U, 4604602620643553229U,
+ 13827974657498329037U, 4604485382263976838U,
+ 4606464709641375231U, 4600718319105833937U,
+ 13824090355960609745U, 4606464709641375231U,
+ 4600411960456200676U, 4606528158595189433U,
+ 13829900195449965241U, 4600411960456200676U,
+ 4606992800820440327U, 4596521820799644122U,
+ 13819893857654419930U, 4606992800820440327U,
+ 4603110210506737381U, 4605710171610479304U,
+ 13829082208465255112U, 4603110210506737381U,
+ 4605618058006716661U, 4603248068256948438U,
+ 13826620105111724246U, 4605618058006716661U,
+ 4595871363584150300U, 4607025146816593591U,
+ 13830397183671369399U, 4595871363584150300U,
+ 4607130541380624519U, 4592387007752762956U,
+ 13815759044607538764U, 4607130541380624519U,
+ 4603825001630339212U, 4605190175055178825U,
+ 13828562211909954633U, 4603825001630339212U,
+ 4606079444829232727U, 4602309411551204896U,
+ 13825681448405980704U, 4606079444829232727U,
+ 4598742041476147134U, 4606818271362779153U,
+ 13830190308217554961U, 4598742041476147134U,
+ 4606770142132396069U, 4599059363095165615U,
+ 13822431399949941423U, 4606770142132396069U,
+ 4602016966272225497U, 4606157602458368090U,
+ 13829529639313143898U, 4602016966272225497U,
+ 4605084992581147553U, 4603953166845776383U,
+ 13827325203700552191U, 4605084992581147553U,
+ 4591066993883984169U, 4607146792632922887U,
+ 13830518829487698695U, 4591066993883984169U,
+ 4607167120476811757U, 4588556721781247689U,
+ 13811928758636023497U, 4607167120476811757U,
+ 4604162403772767740U, 4604904503566677638U,
+ 13828276540421453446U, 4604162403772767740U,
+ 4606281842017099424U, 4601523323048804569U,
+ 13824895359903580377U, 4606281842017099424U,
+ 4599584122834874440U, 4606683463531482757U,
+ 13830055500386258565U, 4599584122834874440U,
+ 4606891971185517504U, 4598209407597805010U,
+ 13821581444452580818U, 4606891971185517504U,
+ 4602734543519989142U, 4605943243960030558U,
+ 13829315280814806366U, 4602734543519989142U,
+ 4605360179893335444U, 4603607160562208225U,
+ 13826979197416984033U, 4605360179893335444U,
+ 4594126307716900071U, 4607096716058023245U,
+ 13830468752912799053U, 4594126307716900071U,
+ 4607072388129742377U, 4594782329999411347U,
+ 13818154366854187155U, 4607072388129742377U,
+ 4603473988668005304U, 4605458946901419122U,
+ 13828830983756194930U, 4603473988668005304U,
+ 4605858005670328613U, 4602876755014813164U,
+ 13826248791869588972U, 4605858005670328613U,
+ 4597600270510262682U, 4606932257325205256U,
+ 13830304294179981064U, 4597600270510262682U,
+ 4606627607157935956U, 4599896339047301634U,
+ 13823268375902077442U, 4606627607157935956U,
+ 4601223560006786057U, 4606352730697093817U,
+ 13829724767551869625U, 4601223560006786057U,
+ 4604793159020491611U, 4604285253548209224U,
+ 13827657290402985032U, 4604793159020491611U,
+ 4585907115494236537U, 4607175255902437396U,
+ 13830547292757213204U, 4585907115494236537U,
+ 4607177290141793710U, 4585023436363055487U,
+ 13808395473217831295U, 4607177290141793710U,
+ 4604325745441780828U, 4604755543975806820U,
+ 13828127580830582628U, 4604325745441780828U,
+ 4606375745674388705U, 4601123065313358619U,
+ 13824495102168134427U, 4606375745674388705U,
+ 4599999947619525579U, 4606608350964852124U,
+ 13829980387819627932U, 4599999947619525579U,
+ 4606945027305114062U, 4597385183080791534U,
+ 13820757219935567342U, 4606945027305114062U,
+ 4602923807199184054U, 4605829012964735987U,
+ 13829201049819511795U, 4602923807199184054U,
+ 4605491322423429598U, 4603429196809300824U,
+ 13826801233664076632U, 4605491322423429598U,
+ 4595000592312171144U, 4607063608453868552U,
+ 13830435645308644360U, 4595000592312171144U,
+ 4607104153983298999U, 4593907249284540294U,
+ 13817279286139316102U, 4607104153983298999U,
+ 4603651144395358093U, 4605326714874986465U,
+ 13828698751729762273U, 4603651144395358093U,
+ 4605971073215153165U, 4602686793990243041U,
+ 13826058830845018849U, 4605971073215153165U,
+ 4598316292140394014U, 4606877885424248132U,
+ 13830249922279023940U, 4598316292140394014U,
+ 4606701442584137310U, 4599479600326345459U,
+ 13822851637181121267U, 4606701442584137310U,
+ 4601622657843474729U, 4606257600839867033U,
+ 13829629637694642841U, 4601622657843474729U,
+ 4604941113561600762U, 4604121000955189926U,
+ 13827493037809965734U, 4604941113561600762U,
+ 4589303678145802340U, 4607163731439411601U,
+ 13830535768294187409U, 4589303678145802340U,
+ 4607151534426937478U, 4590626485056654602U,
+ 13813998521911430410U, 4607151534426937478U,
+ 4603995455647851249U, 4605049409688478101U,
+ 13828421446543253909U, 4603995455647851249U,
+ 4606183055233559255U, 4601918851211878557U,
+ 13825290888066654365U, 4606183055233559255U,
+ 4599164736579548843U, 4606753451050079834U,
+ 13830125487904855642U, 4599164736579548843U,
+ 4606833664420673202U, 4598635880488956483U,
+ 13822007917343732291U, 4606833664420673202U,
+ 4602406247776385022U, 4606052795787882823U,
+ 13829424832642658631U, 4602406247776385022U,
+ 4605224709411790590U, 4603781852316960384U,
+ 13827153889171736192U, 4605224709411790590U,
+ 4592826452951465409U, 4607124449686274900U,
+ 13830496486541050708U, 4592826452951465409U,
+ 4607035262954517034U, 4595654028864046335U,
+ 13819026065718822143U, 4607035262954517034U,
+ 4603293641160266722U, 4605586791482848547U,
+ 13828958828337624355U, 4603293641160266722U,
+ 4605740310302420207U, 4603063884010218172U,
+ 13826435920864993980U, 4605740310302420207U,
+ 4596738097012783531U, 4606981354314050484U,
+ 13830353391168826292U, 4596738097012783531U,
+ 4606548680329491866U, 4600309328230211502U,
+ 13823681365084987310U, 4606548680329491866U,
+ 4600819913163773071U, 4606442934727379583U,
+ 13829814971582155391U, 4600819913163773071U,
+ 4604641218080103285U, 4604445825685214043U,
+ 13827817862539989851U, 4604641218080103285U,
+ 4579996072175835083U, 4607181359080094673U,
+ 13830553395934870481U, 4579996072175835083U,
+ 4607180341788068727U, 4581846703643734566U,
+ 13805218740498510374U, 4607180341788068727U,
+ 4604406033021674239U, 4604679572075463103U,
+ 13828051608930238911U, 4604406033021674239U,
+ 4606420848538580260U, 4600921238092511730U,
+ 13824293274947287538U, 4606420848538580260U,
+ 4600206446098256018U, 4606568886807728474U,
+ 13829940923662504282U, 4600206446098256018U,
+ 4606969576261663845U, 4596954088216812973U,
+ 13820326125071588781U, 4606969576261663845U,
+ 4603017373458244943U, 4605770164172969910U,
+ 13829142201027745718U, 4603017373458244943U,
+ 4605555245917486022U, 4603339021357904144U,
+ 13826711058212679952U, 4605555245917486022U,
+ 4595436449949385485U, 4607045045516813836U,
+ 13830417082371589644U, 4595436449949385485U,
+ 4607118021058468598U, 4593265590854265407U,
+ 13816637627709041215U, 4607118021058468598U,
+ 4603738491917026584U, 4605258978359093269U,
+ 13828631015213869077U, 4603738491917026584U,
+ 4606025850160239809U, 4602502755147763107U,
+ 13825874792002538915U, 4606025850160239809U,
+ 4598529532600161144U, 4606848731493011465U,
+ 13830220768347787273U, 4598529532600161144U,
+ 4606736437002195879U, 4599269903251194481U,
+ 13822641940105970289U, 4606736437002195879U,
+ 4601820425647934753U, 4606208206518262803U,
+ 13829580243373038611U, 4601820425647934753U,
+ 4605013567986435066U, 4604037525321326463U,
+ 13827409562176102271U, 4605013567986435066U,
+ 4590185751760970393U, 4607155938267770208U,
+ 13830527975122546016U, 4590185751760970393U,
+ 4607160003989618959U, 4589744810590291021U,
+ 13813116847445066829U, 4607160003989618959U,
+ 4604079374282302598U, 4604977468824438271U,
+ 13828349505679214079U, 4604079374282302598U,
+ 4606233055365547081U, 4601721693286060937U,
+ 13825093730140836745U, 4606233055365547081U,
+ 4599374859150636784U, 4606719100629313491U,
+ 13830091137484089299U, 4599374859150636784U,
+ 4606863472012527185U, 4598423001813699022U,
+ 13821795038668474830U, 4606863472012527185U,
+ 4602598930031891166U, 4605998608960791335U,
+ 13829370645815567143U, 4602598930031891166U,
+ 4605292980606880364U, 4603694922063032361U,
+ 13827066958917808169U, 4605292980606880364U,
+ 4593688012422887515U, 4607111255739239816U,
+ 13830483292594015624U, 4593688012422887515U,
+ 4607054494135176056U, 4595218635031890910U,
+ 13818590671886666718U, 4607054494135176056U,
+ 4603384207141321914U, 4605523422498301790U,
+ 13828895459353077598U, 4603384207141321914U,
+ 4605799732098147061U, 4602970680601913687U,
+ 13826342717456689495U, 4605799732098147061U,
+ 4597169786279785693U, 4606957467106717424U,
+ 13830329503961493232U, 4597169786279785693U,
+ 4606588777269136769U, 4600103317933788342U,
+ 13823475354788564150U, 4606588777269136769U,
+ 4601022290077223616U, 4606398451906509788U,
+ 13829770488761285596U, 4601022290077223616U,
+ 4604717681185626434U, 4604366005771528720U,
+ 13827738042626304528U, 4604717681185626434U,
+ 4583614727651146525U, 4607178985458280057U,
+ 13830551022313055865U, 4583614727651146525U,
+ 4607172882816799076U, 4586790578280679046U,
+ 13810162615135454854U, 4607172882816799076U,
+ 4604244531615310815U, 4604830524903495634U,
+ 13828202561758271442U, 4604244531615310815U,
+ 4606329407841126011U, 4601323770373937522U,
+ 13824695807228713330U, 4606329407841126011U,
+ 4599792496117920694U, 4606646545123403481U,
+ 13830018581978179289U, 4599792496117920694U,
+ 4606919157647773535U, 4597815040470278984U,
+ 13821187077325054792U, 4606919157647773535U,
+ 4602829525820289164U, 4605886709123365959U,
+ 13829258745978141767U, 4602829525820289164U,
+ 4605426297151190466U, 4603518581031047189U,
+ 13826890617885822997U, 4605426297151190466U,
+ 4594563856311064231U, 4607080832832247697U,
+ 13830452869687023505U, 4594563856311064231U,
+ 4607088942243446236U, 4594345179472540681U,
+ 13817717216327316489U, 4607088942243446236U,
+ 4603562972219549215U, 4605393374401988274U,
+ 13828765411256764082U, 4603562972219549215U,
+ 4605915122243179241U, 4602782121393764535U,
+ 13826154158248540343U, 4605915122243179241U,
+ 4598029484874872834U, 4606905728766014348U,
+ 13830277765620790156U, 4598029484874872834U,
+ 4606665164148251002U, 4599688422741010356U,
+ 13823060459595786164U, 4606665164148251002U,
+ 4601423692641949331U, 4606305777984577632U,
+ 13829677814839353440U, 4601423692641949331U,
+ 4604867640218014515U, 4604203581176243359U,
+ 13827575618031019167U, 4604867640218014515U,
+ 4587673791460508439U, 4607170170974224083U,
+ 13830542207828999891U, 4587673791460508439U,
+ 4607141713064252300U, 4591507261658050721U,
+ 13814879298512826529U, 4607141713064252300U,
+ 4603910660507251362U, 4605120315324767624U,
+ 13828492352179543432U, 4603910660507251362U,
+ 4606131849150971908U, 4602114767134999006U,
+ 13825486803989774814U, 4606131849150971908U,
+ 4598953786765296928U, 4606786509620734768U,
+ 13830158546475510576U, 4598953786765296928U,
+ 4606802552898869248U, 4598848011564831930U,
+ 13822220048419607738U, 4606802552898869248U,
+ 4602212250118051877U, 4606105796280968177U,
+ 13829477833135743985U, 4602212250118051877U,
+ 4605155376589456981U, 4603867938232615808U,
+ 13827239975087391616U, 4605155376589456981U,
+ 4591947271803021404U, 4607136295912168606U,
+ 13830508332766944414U, 4591947271803021404U,
+ 4607014697483910382U, 4596088445927168004U,
+ 13819460482781943812U, 4607014697483910382U,
+ 4603202304363743346U, 4605649044311923410U,
+ 13829021081166699218U, 4603202304363743346U,
+ 4605679749231851918U, 4603156351203636159U,
+ 13826528388058411967U, 4605679749231851918U,
+ 4596305267720071930U, 4607003915349878877U,
+ 13830375952204654685U, 4596305267720071930U,
+ 4606507322377452870U, 4600514338912178239U,
+ 13823886375766954047U, 4606507322377452870U,
+ 4600616459743653188U, 4606486172460753999U,
+ 13829858209315529807U, 4600616459743653188U,
+ 4604563781218984604U, 4604524701268679793U,
+ 13827896738123455601U, 4604563781218984604U,
+ 4569220649180767418U, 4607182376410422530U,
+ 13830554413265198338U, 4569220649180767418U
+};
+
+const fpr fpr_p2_tab[] = {
+ 4611686018427387904U,
+ 4607182418800017408U,
+ 4602678819172646912U,
+ 4598175219545276416U,
+ 4593671619917905920U,
+ 4589168020290535424U,
+ 4584664420663164928U,
+ 4580160821035794432U,
+ 4575657221408423936U,
+ 4571153621781053440U,
+ 4566650022153682944U
+};
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.h b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.h
new file mode 100644
index 000000000..beab1ab66
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/fpr.h
@@ -0,0 +1,491 @@
+/*
+ * Floating-point operations.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/* ====================================================================== */
+/*
+ * Custom floating-point implementation with integer arithmetics. We
+ * use IEEE-754 "binary64" format, with some simplifications:
+ *
+ * - Top bit is s = 1 for negative, 0 for positive.
+ *
+ * - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
+ *
+ * - Mantissa m uses the 52 low bits.
+ *
+ * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
+ * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
+ * less than 1.0), but the top bit (equal to 1 by definition) is omitted
+ * in the encoding.
+ *
+ * In IEEE-754, there are some special values:
+ *
+ * - If e = 2047, then the value is either an infinity (m = 0) or
+ * a NaN (m != 0).
+ *
+ * - If e = 0, then the value is either a zero (m = 0) or a subnormal,
+ * aka "denormalized number" (m != 0).
+ *
+ * Of these, we only need the zeros. The caller is responsible for not
+ * providing operands that would lead to infinites, NaNs or subnormals.
+ * If inputs are such that values go out of range, then indeterminate
+ * values are returned (it would still be deterministic, but no specific
+ * value may be relied upon).
+ *
+ * At the C level, the three parts are stored in a 64-bit unsigned
+ * word.
+ *
+ * One may note that a property of the IEEE-754 format is that order
+ * is preserved for positive values: if two positive floating-point
+ * values x and y are such that x < y, then their respective encodings
+ * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
+ * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
+ * y < 0, and x < y, then i64(x) > i64(y).
+ *
+ * IMPORTANT ASSUMPTIONS:
+ * ======================
+ *
+ * For proper computations, and constant-time behaviour, we assume the
+ * following:
+ *
+ * - 32x32->64 multiplication (unsigned) has an execution time that
+ * is independent of its operands. This is true of most modern
+ * x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
+ * and M3 (in the M0 and M0+, this is done in software, so it depends
+ * on that routine), and the PowerPC cores from the G3/G4 lines.
+ * For more info, see: https://www.bearssl.org/ctmul.html
+ *
+ * - Left-shifts and right-shifts of 32-bit values have an execution
+ * time which does not depend on the shifted value nor on the
+ * shift count. An historical exception is the Pentium IV, but most
+ * modern CPU have barrel shifters. Some small microcontrollers
+ * might have varying-time shifts (not the ARM Cortex M*, though).
+ *
+ * - Right-shift of a signed negative value performs a sign extension.
+ * As per the C standard, this operation returns an
+ * implementation-defined result (this is NOT an "undefined
+ * behaviour"). On most/all systems, an arithmetic shift is
+ * performed, because this is what makes most sense.
+ */
+
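To make the layout above concrete, here is a small standalone sketch (not part of the patch) that splits a 64-bit pattern into the three fields just described. The pattern 4607182418800017408 is the fpr_one constant declared further down in this header; it decodes to sign 0, biased exponent 1023 and mantissa 0, i.e. (-1)^0 * 2^(1023-1023) * 1.0 = 1.0.

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t x = 4607182418800017408U;            /* fpr_one, i.e. 1.0 */
    unsigned s = (unsigned)(x >> 63);             /* sign bit          */
    unsigned e = (unsigned)(x >> 52) & 0x7FF;     /* 11 exponent bits  */
    uint64_t m = x & (((uint64_t)1 << 52) - 1);   /* 52 mantissa bits  */

    printf("s=%u biased_e=%u unbiased_e=%d m=%llu\n",
           s, e, (int)e - 1023, (unsigned long long)m);
    return 0;
}
```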
+/*
+ * Normally we should declare the 'fpr' type to be a struct or union
+ * around the internal 64-bit value; however, we want to use the
+ * direct 64-bit integer type to enable a lighter call convention on
+ * ARM platforms. This means that direct (invalid) use of operators
+ * such as '*' or '+' will not be caught by the compiler. We rely on
+ * the "normal" (non-emulated) code to detect such instances.
+ */
+typedef uint64_t fpr;
+
+/*
+ * For computations, we split values into an integral mantissa in the
+ * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
+ * "sticky" (it is set to 1 if any of the bits below it is 1); when
+ * re-encoding, the low two bits are dropped, but may induce an
+ * increment in the value for proper rounding.
+ */
+
+/*
+ * Right-shift a 64-bit unsigned value by a possibly secret shift count.
+ * We assume that the underlying architecture has a barrel shifter for
+ * 32-bit shifts, but a 64-bit shift on a 32-bit system will
+ * typically invoke a software routine that is not necessarily
+ * constant-time; hence the function below.
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ursh(uint64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Right-shift a 64-bit signed value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline int64_t
+fpr_irsh(int64_t x, int n) {
+ x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);
+ return x >> (n & 31);
+}
+
+/*
+ * Left-shift a 64-bit unsigned value by a possibly secret shift count
+ * (see fpr_ursh() for the rationale).
+ *
+ * Shift count n MUST be in the 0..63 range.
+ */
+static inline uint64_t
+fpr_ulsh(uint64_t x, int n) {
+ x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);
+ return x << (n & 31);
+}
+
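A minimal self-check of the construction used by the three shift helpers above (not part of the patch): the 32-bit conditional swap selected by bit 5 of the count, followed by a shift of 0..31 bits, must agree with a plain 64-bit shift for every count in 0..63.

```c
#include <assert.h>
#include <stdint.h>

/* Same two-step construction as fpr_ursh() above, copied here so the
 * check compiles on its own. */
static uint64_t ursh_ct(uint64_t x, int n) {
    x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);
    return x >> (n & 31);
}

int main(void) {
    uint64_t x = 0x0123456789ABCDEFU;
    int n;

    for (n = 0; n < 64; n++) {
        assert(ursh_ct(x, n) == (x >> n));
    }
    return 0;
}
```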
+/*
+ * Expectations:
+ * s = 0 or 1
+ * exponent e is "arbitrary" and unbiased
+ * 2^54 <= m < 2^55
+ * Numerical value is (-1)^s * m * 2^e
+ *
+ * Exponents which are too low lead to value zero. If the exponent is
+ * too large, the returned value is indeterminate.
+ *
+ * If m = 0, then a zero is returned (using the provided sign).
+ * If e < -1076, then a zero is returned (regardless of the value of m).
+ * If e >= -1076 and e != 0, m must be within the expected range
+ * (2^54 to 2^55-1).
+ */
+static inline fpr
+FPR(int s, int e, uint64_t m) {
+ fpr x;
+ uint32_t t;
+ unsigned f;
+
+ /*
+ * If e >= -1076, then the value is "normal"; otherwise, it
+ * should be a subnormal, which we clamp down to zero.
+ */
+ e += 1076;
+ t = (uint32_t)e >> 31;
+ m &= (uint64_t)t - 1;
+
+ /*
+ * If m = 0 then we want a zero; make e = 0 too, but conserve
+ * the sign.
+ */
+ t = (uint32_t)(m >> 54);
+ e &= -(int)t;
+
+ /*
+ * The 52 mantissa bits come from m. Value m has its top bit set
+ * (unless it is a zero); we leave it "as is": the top bit will
+ * increment the exponent by 1, except when m = 0, which is
+ * exactly what we want.
+ */
+ x = (((uint64_t)s << 63) | (m >> 2)) + ((uint64_t)(uint32_t)e << 52);
+
+ /*
+ * Rounding: if the low three bits of m are 011, 110 or 111,
+ * then the value should be incremented to get the next
+ * representable value. This implements the usual
+ * round-to-nearest rule (with preference to even values in case
+ * of a tie). Note that the increment may make a carry spill
+ * into the exponent field, which is again exactly what we want
+ * in that case.
+ */
+ f = (unsigned)m & 7U;
+ x += (0xC8U >> f) & 1;
+ return x;
+}
+
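A hedged spot-check of FPR(), assuming the definitions in this header plus <assert.h> are in scope (not part of the patch): with m * 2^e as the value and m in the 2^54..2^55-1 range, 1.0 corresponds to m = 2^54 and e = -54, and the result should be the bit pattern 4607182418800017408, i.e. the fpr_one constant declared below. The low three bits of m are 000 here, so the (0xC8U >> f) & 1 rounding term contributes nothing.

```c
#include <assert.h>

static void check_fpr_of_one(void) {
    /* 1.0 = 2^54 * 2^(-54); m sits at the bottom of the 2^54..2^55-1 range. */
    fpr one = FPR(0, -54, (uint64_t)1 << 54);
    assert(one == 4607182418800017408U); /* == fpr_one, declared below */
}
```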
+#define fpr_scaled PQCLEAN_FALCONPADDED512_CLEAN_fpr_scaled
+fpr fpr_scaled(int64_t i, int sc);
+
+static inline fpr
+fpr_of(int64_t i) {
+ return fpr_scaled(i, 0);
+}
+
+static const fpr fpr_q = 4667981563525332992;
+static const fpr fpr_inverse_of_q = 4545632735260551042;
+static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
+static const fpr fpr_inv_sigma[] = {
+ 0, /* unused */
+ 4574611497772390042,
+ 4574501679055810265,
+ 4574396282908341804,
+ 4574245855758572086,
+ 4574103865040221165,
+ 4573969550563515544,
+ 4573842244705920822,
+ 4573721358406441454,
+ 4573606369665796042,
+ 4573496814039276259
+};
+static const fpr fpr_sigma_min[] = {
+ 0, /* unused */
+ 4607707126469777035,
+ 4607777455861499430,
+ 4607846828256951418,
+ 4607949175006100261,
+ 4608049571757433526,
+ 4608148125896792003,
+ 4608244935301382692,
+ 4608340089478362016,
+ 4608433670533905013,
+ 4608525754002622308
+};
+static const fpr fpr_log2 = 4604418534313441775;
+static const fpr fpr_inv_log2 = 4609176140021203710;
+static const fpr fpr_bnorm_max = 4670353323383631276;
+static const fpr fpr_zero = 0;
+static const fpr fpr_one = 4607182418800017408;
+static const fpr fpr_two = 4611686018427387904;
+static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
+static const fpr fpr_ptwo31 = 4746794007248502784;
+static const fpr fpr_ptwo31m1 = 4746794007244308480;
+static const fpr fpr_mtwo31m1 = 13970166044099084288U;
+static const fpr fpr_ptwo63m1 = 4890909195324358656;
+static const fpr fpr_mtwo63m1 = 14114281232179134464U;
+static const fpr fpr_ptwo63 = 4890909195324358656;
+
+static inline int64_t
+fpr_rint(fpr x) {
+ uint64_t m, d;
+ int e;
+ uint32_t s, dd, f;
+
+ /*
+ * We assume that the value fits in -(2^63-1)..+(2^63-1). We can
+ * thus extract the mantissa as a 63-bit integer, then right-shift
+ * it as needed.
+ */
+ m = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ e = 1085 - ((int)(x >> 52) & 0x7FF);
+
+ /*
+ * If a shift of more than 63 bits is needed, then simply set m
+ * to zero. This also covers the case of an input operand equal
+ * to zero.
+ */
+ m &= -(uint64_t)((uint32_t)(e - 64) >> 31);
+ e &= 63;
+
+ /*
+ * Right-shift m as needed. Shift count is e. Proper rounding
+ * mandates that:
+ * - If the highest dropped bit is zero, then round low.
+ * - If the highest dropped bit is one, and at least one of the
+ * other dropped bits is one, then round up.
+ * - If the highest dropped bit is one, and all other dropped
+ * bits are zero, then round up if the lowest kept bit is 1,
+ * or low otherwise (i.e. ties are broken by "rounding to even").
+ *
+ * We thus first extract a word consisting of all the dropped bit
+ * AND the lowest kept bit; then we shrink it down to three bits,
+ * the lowest being "sticky".
+ */
+ d = fpr_ulsh(m, 63 - e);
+ dd = (uint32_t)d | ((uint32_t)(d >> 32) & 0x1FFFFFFF);
+ f = (uint32_t)(d >> 61) | ((dd | -dd) >> 31);
+ m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);
+
+ /*
+ * Apply the sign bit.
+ */
+ s = (uint32_t)(x >> 63);
+ return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;
+}
+
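Two hedged examples of the rounding rule implemented above, assuming this header and <assert.h> are in scope (not part of the patch): 1.0 converts to the integer 1, while the tie 0.5 rounds to the even value 0.

```c
#include <assert.h>

static void check_fpr_rint(void) {
    assert(fpr_rint(fpr_one) == 1);     /* rint(1.0) == 1               */
    assert(fpr_rint(fpr_onehalf) == 0); /* rint(0.5) == 0: ties to even */
}
```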
+static inline int64_t
+fpr_floor(fpr x) {
+ uint64_t t;
+ int64_t xi;
+ int e, cc;
+
+ /*
+ * We extract the integer as a _signed_ 64-bit integer with
+ * a scaling factor. Since we assume that the value fits
+ * in the -(2^63-1)..+(2^63-1) range, we can left-shift the
+ * absolute value to make it in the 2^62..2^63-1 range: we
+ * will only need a right-shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ t = x >> 63;
+ xi = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
+ & (((uint64_t)1 << 63) - 1));
+ xi = (xi ^ -(int64_t)t) + (int64_t)t;
+ cc = 1085 - e;
+
+ /*
+ * We perform an arithmetic right-shift on the value. This
+ * applies floor() semantics on both positive and negative values
+ * (rounding toward minus infinity).
+ */
+ xi = fpr_irsh(xi, cc & 63);
+
+ /*
+ * If the true shift count was 64 or more, then we should instead
+ * replace xi with 0 (if nonnegative) or -1 (if negative). Edge
+ * case: -0 will be floored to -1, not 0 (whether this is correct
+ * is debatable; in any case, the other functions normalize zero
+ * to +0).
+ *
+ * For an input of zero, the non-shifted xi was incorrect (we used
+ * a top implicit bit of value 1, not 0), but this does not matter
+ * since this operation will clamp it down.
+ */
+ xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);
+ return xi;
+}
+
+static inline int64_t
+fpr_trunc(fpr x) {
+ uint64_t t, xu;
+ int e, cc;
+
+ /*
+ * Extract the absolute value. Since we assume that the value
+ * fits in the -(2^63-1)..+(2^63-1) range, we can left-shift
+ * the absolute value into the 2^62..2^63-1 range, and then
+ * do a right shift afterwards.
+ */
+ e = (int)(x >> 52) & 0x7FF;
+ xu = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
+ cc = 1085 - e;
+ xu = fpr_ursh(xu, cc & 63);
+
+ /*
+ * If the exponent is too low (cc > 63), then the shift was wrong
+ * and we must clamp the value to 0. This also covers the case
+ * of an input equal to zero.
+ */
+ xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);
+
+ /*
+ * Apply back the sign, if the source value is negative.
+ */
+ t = x >> 63;
+ xu = (xu ^ -t) + t;
+ return *(int64_t *)&xu;
+}
+
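Hedged examples contrasting fpr_floor() and fpr_trunc(), assuming this header and <assert.h> are in scope (not part of the patch): flooring rounds toward minus infinity while truncation rounds toward zero, so the two differ on -0.5. The explicit sign flip below is exactly what fpr_neg(), defined further down, performs.

```c
#include <assert.h>

static void check_floor_trunc(void) {
    fpr minus_half = fpr_onehalf ^ ((uint64_t)1 << 63); /* -0.5 */

    assert(fpr_floor(minus_half) == -1); /* toward minus infinity */
    assert(fpr_trunc(minus_half) == 0);  /* toward zero           */
}
```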
+#define fpr_add PQCLEAN_FALCONPADDED512_CLEAN_fpr_add
+fpr fpr_add(fpr x, fpr y);
+
+static inline fpr
+fpr_sub(fpr x, fpr y) {
+ y ^= (uint64_t)1 << 63;
+ return fpr_add(x, y);
+}
+
+static inline fpr
+fpr_neg(fpr x) {
+ x ^= (uint64_t)1 << 63;
+ return x;
+}
+
+static inline fpr
+fpr_half(fpr x) {
+ /*
+ * To divide a value by 2, we just have to subtract 1 from its
+ * exponent, but we have to take care of zero.
+ */
+ uint32_t t;
+
+ x -= (uint64_t)1 << 52;
+ t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;
+ x &= (uint64_t)t - 1;
+ return x;
+}
+
+static inline fpr
+fpr_double(fpr x) {
+ /*
+ * To double a value, we just increment by one the exponent. We
+ * don't care about infinities or NaNs; however, 0 is a
+ * special case.
+ */
+ x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;
+ return x;
+}
+
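Hedged examples of the exponent-only halving and doubling above, using the constants declared earlier in this header and assuming <assert.h> is in scope (not part of the patch); zero is the special case both functions must preserve.

```c
#include <assert.h>

static void check_half_double(void) {
    assert(fpr_half(fpr_one) == fpr_onehalf);  /* exponent decremented */
    assert(fpr_double(fpr_one) == fpr_two);    /* exponent incremented */
    assert(fpr_half(fpr_zero) == fpr_zero);    /* zero stays zero      */
    assert(fpr_double(fpr_zero) == fpr_zero);  /* zero stays zero      */
}
```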
+#define fpr_mul PQCLEAN_FALCONPADDED512_CLEAN_fpr_mul
+fpr fpr_mul(fpr x, fpr y);
+
+static inline fpr
+fpr_sqr(fpr x) {
+ return fpr_mul(x, x);
+}
+
+#define fpr_div PQCLEAN_FALCONPADDED512_CLEAN_fpr_div
+fpr fpr_div(fpr x, fpr y);
+
+static inline fpr
+fpr_inv(fpr x) {
+ return fpr_div(4607182418800017408u, x);
+}
+
+#define fpr_sqrt PQCLEAN_FALCONPADDED512_CLEAN_fpr_sqrt
+fpr fpr_sqrt(fpr x);
+
+static inline int
+fpr_lt(fpr x, fpr y) {
+ /*
+ * If both x and y are positive, then a signed comparison yields
+ * the proper result:
+ * - For positive values, the order is preserved.
+ * - The sign bit is at the same place as in integers, so
+ * sign is preserved.
+ * Moreover, we can compute [x < y] as sgn(x-y) and the computation
+ * of x-y will not overflow.
+ *
+ * If the signs differ, then sgn(x) gives the proper result.
+ *
+ * If both x and y are negative, then the order is reversed.
+ * Hence [x < y] = sgn(y-x). We must compute this separately from
+ * sgn(x-y); simply inverting sgn(x-y) would not handle the edge
+ * case x = y properly.
+ */
+ int cc0, cc1;
+ int64_t sx;
+ int64_t sy;
+
+ sx = *(int64_t *)&x;
+ sy = *(int64_t *)&y;
+ sy &= ~((sx ^ sy) >> 63); /* set sy=0 if signs differ */
+
+ cc0 = (int)((sx - sy) >> 63) & 1; /* Neither subtraction overflows when */
+ cc1 = (int)((sy - sx) >> 63) & 1; /* the signs are the same. */
+
+ return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));
+}
+
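Hedged examples covering the three cases discussed above (both operands positive, differing signs, both negative), assuming this header and <assert.h> are in scope (not part of the patch).

```c
#include <assert.h>

static void check_fpr_lt(void) {
    fpr mone  = fpr_neg(fpr_one);     /* -1.0 */
    fpr mhalf = fpr_neg(fpr_onehalf); /* -0.5 */

    assert(fpr_lt(fpr_onehalf, fpr_one) == 1); /* both positive          */
    assert(fpr_lt(mone, fpr_onehalf) == 1);    /* signs differ           */
    assert(fpr_lt(mone, mhalf) == 1);          /* both negative: reversed
                                                  integer order handled  */
    assert(fpr_lt(fpr_one, fpr_one) == 0);     /* x < x is false         */
}
```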
+/*
+ * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
+ * bits or so.
+ */
+#define fpr_expm_p63 PQCLEAN_FALCONPADDED512_CLEAN_fpr_expm_p63
+uint64_t fpr_expm_p63(fpr x, fpr ccs);
+
+#define fpr_gm_tab PQCLEAN_FALCONPADDED512_CLEAN_fpr_gm_tab
+extern const fpr fpr_gm_tab[];
+
+#define fpr_p2_tab PQCLEAN_FALCONPADDED512_CLEAN_fpr_p2_tab
+extern const fpr fpr_p2_tab[];
+
+/* ====================================================================== */
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/inner.h b/src/sig/falcon/pqclean_falcon-padded-512_clean/inner.h
new file mode 100644
index 000000000..361f06263
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/inner.h
@@ -0,0 +1,820 @@
+#ifndef FALCON_INNER_H__
+#define FALCON_INNER_H__
+
+/*
+ * Internal functions for Falcon. This is not the API intended to be
+ * used by applications; instead, this internal API provides all the
+ * primitives on which wrappers build to provide external APIs.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ * - All public functions (i.e. the non-static ones) must be referenced
+ * with the PQCLEAN_FALCONPADDED512_CLEAN_ macro (e.g. PQCLEAN_FALCONPADDED512_CLEAN_verify_raw for the verify_raw()
+ * function). That macro adds a prefix to the name, which is
+ * configurable with the FALCON_PREFIX macro. This allows compiling
+ * the code into a specific "namespace" and potentially including
+ * several versions of this code into a single application (e.g. to
+ * have an AVX2 and a non-AVX2 variants and select the one to use at
+ * runtime based on availability of AVX2 opcodes).
+ *
+ * - Functions that need temporary buffers expect them as a final
+ * tmp[] array of type uint8_t*, with a size which is documented for
+ * each function. However, most have some alignment requirements,
+ * because they will use the array to store 16-bit, 32-bit or 64-bit
+ * values (e.g. uint64_t or double). The caller must ensure proper
+ * alignment. What happens on unaligned access depends on the
+ * underlying architecture, ranging from a slight time penalty
+ * to immediate termination of the process.
+ *
+ * - Some functions rely on specific rounding rules and precision for
+ * floating-point numbers. On some systems (in particular 32-bit x86
+ * with the 387 FPU), this requires setting a hardware control
+ * word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ * oldcw = set_fpu_cw(2);
+ * PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(...);
+ * set_fpu_cw(oldcw);
+ *
+ * On systems where the native floating-point precision is already
+ * proper, or integer-based emulation is used, the set_fpu_cw()
+ * function does nothing, so it can be called systematically.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+ return x;
+}
+
+/* ==================================================================== */
+/*
+ * SHAKE256 implementation (shake.c).
+ *
+ * API is defined to be easily replaced with the fips202.h API defined
+ * as part of PQClean.
+ */
+
+#include "fips202.h"
+
+#define inner_shake256_context shake256incctx
+#define inner_shake256_init(sc) shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_ctx_release(sc) shake256_inc_ctx_release(sc)
+
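A hedged usage sketch of the macros above (not part of the patch), assuming this inner.h and therefore PQClean's fips202.h are in scope: the inject/flip/extract sequence mirrors what the hash_to_point functions further down expect, with the context flipped before anything is derived from it. The 32-byte output length is arbitrary here.

```c
#include <stddef.h>
#include <stdint.h>

static void shake_nonce_and_message(uint8_t out[32],
                                    const uint8_t *nonce, size_t nonce_len,
                                    const uint8_t *msg, size_t msg_len) {
    inner_shake256_context sc;

    inner_shake256_init(&sc);
    inner_shake256_inject(&sc, nonce, nonce_len);
    inner_shake256_inject(&sc, msg, msg_len);
    inner_shake256_flip(&sc);
    inner_shake256_extract(&sc, out, 32);
    inner_shake256_ctx_release(&sc);
}
```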
+/* ==================================================================== */
+/*
+ * Encoding/decoding functions (codec.c).
+ *
+ * Encoding functions take as parameters an output buffer (out) with
+ * a given maximum length (max_out_len); returned value is the actual
+ * number of bytes which have been written. If the output buffer is
+ * not large enough, then 0 is returned (some bytes may have been
+ * written to the buffer). If 'out' is NULL, then 'max_out_len' is
+ * ignored; instead, the function computes and returns the actual
+ * required output length (in bytes).
+ *
+ * Decoding functions take as parameters an input buffer (in) with
+ * its maximum length (max_in_len); returned value is the actual number
+ * of bytes that have been read from the buffer. If the provided length
+ * is too short, then 0 is returned.
+ *
+ * Values to encode or decode are vectors of integers, with N = 2^logn
+ * elements.
+ *
+ * Three encoding formats are defined:
+ *
+ * - modq: sequence of values modulo 12289, each encoded over exactly
+ * 14 bits. The encoder and decoder verify that integers are within
+ * the valid range (0..12288). Values are arrays of uint16.
+ *
+ * - trim: sequence of signed integers, a specified number of bits
+ * each. The number of bits is provided as parameter and includes
+ * the sign bit. Each integer x must be such that |x| < 2^(bits-1)
+ * (which means that the -2^(bits-1) value is forbidden); encode and
+ * decode functions check that property. Values are arrays of
+ * int16_t or int8_t, corresponding to names 'trim_i16' and
+ * 'trim_i8', respectively.
+ *
+ * - comp: variable-length encoding for signed integers; each integer
+ * uses a minimum of 9 bits, possibly more. This is normally used
+ * only for signatures.
+ *
+ */
+
+size_t PQCLEAN_FALCONPADDED512_CLEAN_modq_encode(void *out, size_t max_out_len,
+ const uint16_t *x, unsigned logn);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(void *out, size_t max_out_len,
+ const int8_t *x, unsigned logn, unsigned bits);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_comp_encode(void *out, size_t max_out_len,
+ const int16_t *x, unsigned logn);
+
+size_t PQCLEAN_FALCONPADDED512_CLEAN_modq_decode(uint16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i16_decode(int16_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(int8_t *x, unsigned logn, unsigned bits,
+ const void *in, size_t max_in_len);
+size_t PQCLEAN_FALCONPADDED512_CLEAN_comp_decode(int16_t *x, unsigned logn,
+ const void *in, size_t max_in_len);
+
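+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): the two-pass pattern described above. A
+ * first call with a NULL output buffer returns the exact encoded
+ * length; the second call performs the encoding proper and, on
+ * success, returns that same length. example_encode_small_poly is a
+ * hypothetical helper name.
+ */
+static inline size_t
+example_encode_small_poly(uint8_t *out, size_t max_out_len,
+                          const int8_t *f, unsigned logn, unsigned bits) {
+    size_t need;
+
+    need = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(NULL, 0, f, logn, bits);
+    if (need == 0 || need > max_out_len) {
+        return 0;   /* out-of-range coefficient, or buffer too small */
+    }
+    return PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(out, max_out_len,
+            f, logn, bits);
+}
+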
+/*
+ * Number of bits for key elements, indexed by logn (1 to 10). This
+ * is at most 8 bits for all degrees, but some degrees may have shorter
+ * elements.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[];
+extern const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[];
+
+/*
+ * Maximum size, in bits, of elements in a signature, indexed by logn
+ * (1 to 10). The size includes the sign bit.
+ */
+extern const uint8_t PQCLEAN_FALCONPADDED512_CLEAN_max_sig_bits[];
+
+/* ==================================================================== */
+/*
+ * Support functions used for both signature generation and signature
+ * verification (common.c).
+ */
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
+ uint16_t *x, unsigned logn, uint8_t *tmp);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. This compares the appropriate norm of the
+ * vector with the acceptance bound. Returned value is 1 on success
+ * (vector is short enough to be acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_is_short(const int16_t *s1, const int16_t *s2, unsigned logn);
+
+/*
+ * Tell whether a given vector (2N coordinates, in two halves) is
+ * acceptable as a signature. Instead of the first half s1, this
+ * function receives the "saturated squared norm" of s1, i.e. the
+ * sum of the squares of the coordinates of s1 (saturated at 2^32-1
+ * if the sum exceeds 2^31-1).
+ *
+ * Returned value is 1 on success (vector is short enough to be
+ * acceptable), 0 otherwise.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(uint32_t sqn, const int16_t *s2, unsigned logn);
+
+/* ==================================================================== */
+/*
+ * Signature verification functions (vrfy.c).
+ */
+
+/*
+ * Convert a public key to NTT + Montgomery format. Conversion is done
+ * in place.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
+
+/*
+ * Internal signature verification code:
+ * c0[] contains the hashed nonce+message
+ * s2[] is the decoded signature
+ * h[] contains the public key, in NTT + Montgomery format
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute the public key h[], given the private key elements f[] and
+ * g[]. This computes h = g/f mod phi mod q, where phi is the polynomial
+ * modulus. This function returns 1 on success, 0 on error (an error is
+ * reported if f is not invertible mod phi mod q).
+ *
+ * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
+
+/*
+ * Recompute the fourth private key element. The private key consists of
+ * four polynomials with small coefficients f, g, F and G, which are
+ * such that fG - gF = q mod phi; furthermore, f is invertible modulo
+ * phi and modulo q. This function recomputes G from f, g and F.
+ *
+ * The tmp[] array must have room for at least 4*2^logn bytes.
+ *
+ * Returned value is 1 on success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ * h[] receives the public key (NOT in NTT/Montgomery format)
+ * c0[] contains the hashed nonce+message
+ * s1[] is the first signature half
+ * s2[] is the second signature half
+ * logn is the degree log
+ * tmp[] temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regards to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp);
+
+/* ==================================================================== */
+/*
+ * Implementation of floating-point real numbers (fpr.h, fpr.c).
+ */
+
+/*
+ * Real numbers are implemented by an extra header file, included below.
+ * This is meant to support pluggable implementations. The default
+ * implementation relies on the C type 'double'.
+ *
+ * The included file must define the following types, functions and
+ * constants:
+ *
+ * fpr
+ * type for a real number
+ *
+ * fpr fpr_of(int64_t i)
+ * cast an integer into a real number; source must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_scaled(int64_t i, int sc)
+ * compute i*2^sc as a real number; source 'i' must be in the
+ * -(2^63-1)..+(2^63-1) range
+ *
+ * fpr fpr_ldexp(fpr x, int e)
+ * compute x*2^e
+ *
+ * int64_t fpr_rint(fpr x)
+ * round x to the nearest integer; x must be in the -(2^63-1)
+ * to +(2^63-1) range
+ *
+ * int64_t fpr_trunc(fpr x)
+ * round to an integer; this rounds towards zero; value must
+ * be in the -(2^63-1) to +(2^63-1) range
+ *
+ * fpr fpr_add(fpr x, fpr y)
+ * compute x + y
+ *
+ * fpr fpr_sub(fpr x, fpr y)
+ * compute x - y
+ *
+ * fpr fpr_neg(fpr x)
+ * compute -x
+ *
+ * fpr fpr_half(fpr x)
+ * compute x/2
+ *
+ * fpr fpr_double(fpr x)
+ * compute x*2
+ *
+ * fpr fpr_mul(fpr x, fpr y)
+ * compute x * y
+ *
+ * fpr fpr_sqr(fpr x)
+ * compute x * x
+ *
+ * fpr fpr_inv(fpr x)
+ * compute 1/x
+ *
+ * fpr fpr_div(fpr x, fpr y)
+ * compute x/y
+ *
+ * fpr fpr_sqrt(fpr x)
+ * compute the square root of x
+ *
+ * int fpr_lt(fpr x, fpr y)
+ * return 1 if x < y, 0 otherwise
+ *
+ * uint64_t fpr_expm_p63(fpr x)
+ * return exp(-x), assuming that 0 <= x < log(2). Returned value
+ * is scaled to 63 bits (i.e. it really returns 2^63*exp(-x),
+ * rounded to the nearest integer). Computation should have a
+ * precision of at least 45 bits.
+ *
+ * const fpr fpr_gm_tab[]
+ * array of constants for FFT / iFFT
+ *
+ * const fpr fpr_p2_tab[]
+ * precomputed powers of 2 (by index, 0 to 10)
+ *
+ * Constants of type 'fpr':
+ *
+ * fpr fpr_q 12289
+ * fpr fpr_inverse_of_q 1/12289
+ * fpr fpr_inv_2sqrsigma0 1/(2*(1.8205^2))
+ * fpr fpr_inv_sigma[] 1/sigma (indexed by logn, 1 to 10)
+ * fpr fpr_sigma_min[] sigma_min (indexed by logn, 1 to 10)
+ * fpr fpr_log2 log(2)
+ * fpr fpr_inv_log2 1/log(2)
+ * fpr fpr_bnorm_max 16822.4121
+ * fpr fpr_zero 0
+ * fpr fpr_one 1
+ * fpr fpr_two 2
+ * fpr fpr_onehalf 0.5
+ * fpr fpr_ptwo31 2^31
+ * fpr fpr_ptwo31m1 2^31-1
+ * fpr fpr_mtwo31m1 -(2^31-1)
+ * fpr fpr_ptwo63m1 2^63-1
+ * fpr fpr_mtwo63m1 -(2^63-1)
+ * fpr fpr_ptwo63 2^63
+ */
+#include "fpr.h"
+
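+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): the abstract fpr API in action. This
+ * computes the squared l2-norm of a small integer vector and compares
+ * it against a bound, without assuming that 'fpr' is the native
+ * 'double' type. example_fpr_norm_below is a hypothetical name.
+ */
+static inline int
+example_fpr_norm_below(const int16_t *v, size_t n, fpr bound) {
+    fpr s;
+    size_t u;
+
+    s = fpr_zero;
+    for (u = 0; u < n; u ++) {
+        s = fpr_add(s, fpr_sqr(fpr_of(v[u])));
+    }
+    return fpr_lt(s, bound);   /* 1 if the squared norm is below the bound */
+}
+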
+/* ==================================================================== */
+/*
+ * RNG (rng.c).
+ *
+ * A PRNG based on ChaCha20 is implemented; it is seeded from a SHAKE256
+ * context (flipped) and is used for bulk pseudorandom generation.
+ * A system-dependent seed generator is also provided.
+ */
+
+/*
+ * Obtain a random seed from the system RNG.
+ *
+ * Returned value is 1 on success, 0 on error.
+ */
+int PQCLEAN_FALCONPADDED512_CLEAN_get_seed(void *seed, size_t seed_len);
+
+/*
+ * Structure for a PRNG. This includes a large buffer so that values
+ * get generated in advance. The 'state' is used to keep the current
+ * PRNG algorithm state (contents depend on the selected algorithm).
+ *
+ * The unions with 'dummy_u64' are there to ensure proper alignment for
+ * 64-bit direct access.
+ */
+typedef struct {
+ union {
+ uint8_t d[512]; /* MUST be 512, exactly */
+ uint64_t dummy_u64;
+ } buf;
+ size_t ptr;
+ union {
+ uint8_t d[256];
+ uint64_t dummy_u64;
+ } state;
+ int type;
+} prng;
+
+/*
+ * Instantiate a PRNG. The PRNG is seeded from the provided SHAKE256
+ * context (which must be in "flipped" state) to obtain its initial state.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_prng_init(prng *p, inner_shake256_context *src);
+
+/*
+ * Refill the PRNG buffer. This is normally invoked automatically, and
+ * is declared here only so that prng_get_u64() may be inlined.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(prng *p);
+
+/*
+ * Get some bytes from a PRNG.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len);
+
+/*
+ * Get a 64-bit random value from a PRNG.
+ */
+static inline uint64_t
+prng_get_u64(prng *p) {
+ size_t u;
+
+ /*
+ * If there are fewer than 9 bytes in the buffer, we refill it.
+ * This means that we may drop the last few bytes, but this allows
+ * for faster extraction code. Also, it means that we never leave
+ * an empty buffer.
+ */
+ u = p->ptr;
+ if (u >= (sizeof p->buf.d) - 9) {
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+ u = 0;
+ }
+ p->ptr = u + 8;
+
+ return (uint64_t)p->buf.d[u + 0]
+ | ((uint64_t)p->buf.d[u + 1] << 8)
+ | ((uint64_t)p->buf.d[u + 2] << 16)
+ | ((uint64_t)p->buf.d[u + 3] << 24)
+ | ((uint64_t)p->buf.d[u + 4] << 32)
+ | ((uint64_t)p->buf.d[u + 5] << 40)
+ | ((uint64_t)p->buf.d[u + 6] << 48)
+ | ((uint64_t)p->buf.d[u + 7] << 56);
+}
+
+/*
+ * Get an 8-bit random value from a PRNG.
+ */
+static inline unsigned
+prng_get_u8(prng *p) {
+ unsigned v;
+
+ v = p->buf.d[p->ptr ++];
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+ }
+ return v;
+}
+
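+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): seeding and using the PRNG declared above.
+ * A SHAKE256 context is seeded (here from the system RNG), flipped,
+ * and used to instantiate the PRNG; bulk bytes and 64-bit words can
+ * then be drawn from it. example_prng_usage is a hypothetical name.
+ */
+static inline int
+example_prng_usage(uint8_t *dst, size_t len) {
+    inner_shake256_context sc;
+    prng p;
+    uint8_t seed[48];
+
+    if (!PQCLEAN_FALCONPADDED512_CLEAN_get_seed(seed, sizeof seed)) {
+        return 0;
+    }
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, seed, sizeof seed);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&p, &sc);
+    inner_shake256_ctx_release(&sc);
+    PQCLEAN_FALCONPADDED512_CLEAN_prng_get_bytes(&p, dst, len);
+    (void)prng_get_u64(&p);   /* one extra 64-bit draw, for illustration */
+    return 1;
+}
+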
+/* ==================================================================== */
+/*
+ * FFT (falcon-fft.c).
+ *
+ * A real polynomial is represented as an array of N 'fpr' elements.
+ * The FFT representation of a real polynomial contains N/2 complex
+ * elements; each is stored as two real numbers, for the real and
+ * imaginary parts, respectively. See falcon-fft.c for details on the
+ * internal representation.
+ */
+
+/*
+ * Compute FFT in-place: the source array should contain a real
+ * polynomial (N coefficients); its storage area is reused to store
+ * the FFT representation of that polynomial (N/2 complex numbers).
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_FFT(fpr *f, unsigned logn);
+
+/*
+ * Compute the inverse FFT in-place: the source array should contain the
+ * FFT representation of a real polynomial (N/2 elements); the resulting
+ * real polynomial (N coefficients of type 'fpr') is written over the
+ * array.
+ *
+ * 'logn' MUST lie between 1 and 10 (inclusive).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_iFFT(fpr *f, unsigned logn);
+
+/*
+ * Add polynomial b to polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_add(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Subtract polynomial b from polynomial a. a and b MUST NOT overlap. This
+ * function works in both normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Negate polynomial a. This function works in both normal and FFT
+ * representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(fpr *a, unsigned logn);
+
+/*
+ * Compute adjoint of polynomial a. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial a with polynomial b. a and b MUST NOT overlap.
+ * This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(fpr *a, const fpr *b, unsigned logn);
+
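+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): multiplying two real polynomials modulo
+ * X^N+1 through the FFT. Both operands (N = 2^logn 'fpr' values each,
+ * non-overlapping) are transformed, multiplied pointwise in FFT
+ * representation, and the product is brought back with the inverse
+ * FFT; b is clobbered. example_poly_mul is a hypothetical name.
+ */
+static inline void
+example_poly_mul(fpr *a, fpr *b, unsigned logn) {
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(a, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_FFT(b, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(a, b, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_iFFT(a, logn);   /* a <- a*b mod X^N+1 */
+}
+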
+/*
+ * Multiply polynomial a with the adjoint of polynomial b. a and b MUST NOT
+ * overlap. This function works only in FFT representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Multiply polynomial with its own adjoint. This function works only in FFT
+ * representation.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(fpr *a, unsigned logn);
+
+/*
+ * Multiply polynomial with a real constant. This function works in both
+ * normal and FFT representations.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(fpr *a, fpr x, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, modulo X^N+1 (FFT representation).
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_div_fft(fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given f and g (in FFT representation), compute 1/(f*adj(f)+g*adj(g))
+ * (also in FFT representation). Since the result is auto-adjoint, all its
+ * coordinates in FFT representation are real; as such, only the first N/2
+ * values of d[] are filled (the imaginary parts are skipped).
+ *
+ * Array d MUST NOT overlap with either a or b.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(fpr *d,
+ const fpr *a, const fpr *b, unsigned logn);
+
+/*
+ * Given F, G, f and g (in FFT representation), compute F*adj(f)+G*adj(g)
+ * (also in FFT representation). Destination d MUST NOT overlap with
+ * any of the source arrays.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_add_muladj_fft(fpr *d,
+ const fpr *F, const fpr *G,
+ const fpr *f, const fpr *g, unsigned logn);
+
+/*
+ * Multiply polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Divide polynomial a by polynomial b, where b is autoadjoint. Both
+ * a and b are in FFT representation. Since b is autoadjoint, all its
+ * FFT coefficients are real, and the array b contains only N/2 elements.
+ * a and b MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_div_autoadj_fft(fpr *a,
+ const fpr *b, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. On input, g00, g01 and g11 are provided (where the
+ * matrix G = [[g00, g01], [adj(g01), g11]]). On output, the d00, l10
+ * and d11 values are written in g00, g01 and g11, respectively
+ * (with D = [[d00, 0], [0, d11]] and L = [[1, 0], [l10, 1]]).
+ * (In fact, d00 = g00, so the g00 operand is left unmodified.)
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_LDL_fft(const fpr *g00,
+ fpr *g01, fpr *g11, unsigned logn);
+
+/*
+ * Perform an LDL decomposition of an auto-adjoint matrix G, in FFT
+ * representation. This is identical to poly_LDL_fft() except that
+ * g00, g01 and g11 are unmodified; the outputs d11 and l10 are written
+ * in two other separate buffers provided as extra parameters.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(fpr *d11, fpr *l10,
+ const fpr *g00, const fpr *g01,
+ const fpr *g11, unsigned logn);
+
+/*
+ * Apply "split" operation on a polynomial in FFT representation:
+ * f = f0(x^2) + x*f1(x^2), for half-size polynomials f0 and f1
+ * (polynomials modulo X^(N/2)+1). f0, f1 and f MUST NOT overlap.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(fpr *f0, fpr *f1,
+ const fpr *f, unsigned logn);
+
+/*
+ * Apply "merge" operation on two polynomials in FFT representation:
+ * given f0 and f1, polynomials modulo X^(N/2)+1, this function computes
+ * f = f0(x^2) + x*f1(x^2), in FFT representation modulo X^N+1.
+ * f MUST NOT overlap with either f0 or f1.
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(fpr *f,
+ const fpr *f0, const fpr *f1, unsigned logn);
+
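+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): split followed by merge reconstructs the
+ * original polynomial (up to floating-point rounding). f holds
+ * N = 2^logn 'fpr' values in FFT representation; t0 and t1 are
+ * scratch arrays of N/2 values each, distinct from f.
+ */
+static inline void
+example_split_merge(fpr *f, fpr *t0, fpr *t1, unsigned logn) {
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(t0, t1, f, logn);
+    PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(f, t0, t1, logn);
+}
+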
+/* ==================================================================== */
+/*
+ * Key pair generation.
+ */
+
+/*
+ * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
+ */
+#define FALCON_KEYGEN_TEMP_1 136
+#define FALCON_KEYGEN_TEMP_2 272
+#define FALCON_KEYGEN_TEMP_3 224
+#define FALCON_KEYGEN_TEMP_4 448
+#define FALCON_KEYGEN_TEMP_5 896
+#define FALCON_KEYGEN_TEMP_6 1792
+#define FALCON_KEYGEN_TEMP_7 3584
+#define FALCON_KEYGEN_TEMP_8 7168
+#define FALCON_KEYGEN_TEMP_9 14336
+#define FALCON_KEYGEN_TEMP_10 28672
+
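+/*
+ * Illustrative check (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): for logn >= 3 the macros above are exactly
+ * 28*2^logn bytes, as stated in the comment. Requires C11
+ * _Static_assert.
+ */
+_Static_assert(FALCON_KEYGEN_TEMP_9 == 28u * 512u, "28*2^9 bytes");
+_Static_assert(FALCON_KEYGEN_TEMP_10 == 28u * 1024u, "28*2^10 bytes");
+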
+/*
+ * Generate a new key pair. Randomness is extracted from the provided
+ * SHAKE256 context, which must have already been seeded and flipped.
+ * The tmp[] array must have suitable size (see FALCON_KEYGEN_TEMP_*
+ * macros) and be aligned for the uint32_t, uint64_t and fpr types.
+ *
+ * The private key elements are written in f, g, F and G, and the
+ * public key is written in h. Either or both of G and h may be NULL,
+ * in which case the corresponding element is not returned (they can
+ * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp);
+
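+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): generating a Falcon-512 key pair (logn = 9)
+ * with the function declared above. The temporary buffer is placed in
+ * a union to obtain 64-bit alignment; the SHAKE256 context is seeded
+ * from a caller-provided seed and flipped before use.
+ * example_keygen_512 is a hypothetical name.
+ */
+static inline void
+example_keygen_512(const uint8_t *seed, size_t seed_len,
+                   int8_t f[512], int8_t g[512], int8_t F[512],
+                   int8_t G[512], uint16_t h[512]) {
+    union {
+        uint8_t b[FALCON_KEYGEN_TEMP_9];
+        uint64_t align;
+    } tmp;
+    inner_shake256_context rng;
+
+    inner_shake256_init(&rng);
+    inner_shake256_inject(&rng, seed, seed_len);
+    inner_shake256_flip(&rng);
+    PQCLEAN_FALCONPADDED512_CLEAN_keygen(&rng, f, g, F, G, h, 9, tmp.b);
+    inner_shake256_ctx_release(&rng);
+}
+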
+/* ==================================================================== */
+/*
+ * Signature generation.
+ */
+
+/*
+ * Expand a private key into the B0 matrix in FFT representation and
+ * the LDL tree. All the values are written in 'expanded_key', for
+ * a total of (8*logn+40)*2^logn bytes.
+ *
+ * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses an
+ * expanded key (as generated by PQCLEAN_FALCONPADDED512_CLEAN_expand_privkey()).
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Compute a signature over the provided hashed message (hm); the
+ * signature value is one short vector. This function uses a raw
+ * key and dynamically recomputes the B0 matrix and LDL tree; this
+ * saves RAM since there is no need for an expanded key, but
+ * increases the signature cost.
+ *
+ * The sig[] and hm[] buffers may overlap.
+ *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
+ * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
+ */
+void PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ * ctx pointer to the sampler_context structure
+ * mu center for the distribution
+ * isigma inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+ prng p;
+ fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCONPADDED512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCONPADDED512_CLEAN_gaussian0_sampler(prng *p);
+
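+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon/PQClean sources): setting up a sampler_context and drawing a
+ * single value. The centre and standard deviation used here (mu = 0,
+ * sigma = 2, i.e. isigma = 0.5) are arbitrary illustration values, not
+ * the ones used by the signing code; sigma_min is passed in by the
+ * caller. example_sampler_draw is a hypothetical name.
+ */
+static inline int
+example_sampler_draw(inner_shake256_context *seeded_rng, fpr sigma_min) {
+    sampler_context sc;
+
+    PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&sc.p, seeded_rng);
+    sc.sigma_min = sigma_min;
+    return PQCLEAN_FALCONPADDED512_CLEAN_sampler(&sc,
+            fpr_of(0), fpr_half(fpr_one));
+}
+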
+/* ==================================================================== */
+
+#endif
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/keygen.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/keygen.c
new file mode 100644
index 000000000..f556877cc
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/keygen.c
@@ -0,0 +1,4234 @@
+/*
+ * Falcon key pair generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* ==================================================================== */
+/*
+ * Modular arithmetic.
+ *
+ * We implement a few functions for computing modulo a small integer p.
+ *
+ * All functions require that 2^30 < p < 2^31. Moreover, operands must
+ * be in the 0..p-1 range.
+ *
+ * Modular addition and subtraction work for all such p.
+ *
+ * Montgomery multiplication requires that p is odd, and must be provided
+ * with an additional value p0i = -1/p mod 2^31. See below for some basics
+ * on Montgomery multiplication.
+ *
+ * Division computes an inverse modulo p by an exponentiation (with
+ * exponent p-2): this works only if p is prime. Multiplication
+ * requirements also apply, i.e. p must be odd and p0i must be provided.
+ *
+ * The NTT and inverse NTT need all of the above, and also that
+ * p = 1 mod 2048.
+ *
+ * -----------------------------------------------------------------------
+ *
+ * We use Montgomery representation with 31-bit values:
+ *
+ * Let R = 2^31 mod p. When 2^30 < p < 2^31, R = 2^31 - p.
+ * Montgomery representation of an integer x modulo p is x*R mod p.
+ *
+ * Montgomery multiplication computes (x*y)/R mod p for
+ * operands x and y. Therefore:
+ *
+ * - if operands are x*R and y*R (Montgomery representations of x and
+ * y), then Montgomery multiplication computes (x*R*y*R)/R = (x*y)*R
+ * mod p, which is the Montgomery representation of the product x*y;
+ *
+ * - if operands are x*R and y (or x and y*R), then Montgomery
+ * multiplication returns x*y mod p: mixed-representation
+ * multiplications yield results in normal representation.
+ *
+ * To convert to Montgomery representation, we multiply by R, which is done
+ * by Montgomery-multiplying by R^2. Stand-alone conversion back from
+ * Montgomery representation is Montgomery-multiplication by 1.
+ */
+
+/*
+ * Precomputed small primes. Each element contains the following:
+ *
+ * p The prime itself.
+ *
+ * g A primitive root of phi = X^N+1 (in field Z_p).
+ *
+ * s The inverse of the product of all previous primes in the array,
+ * computed modulo p and in Montgomery representation.
+ *
+ * All primes are such that p = 1 mod 2048, and are lower than 2^31. They
+ * are listed in decreasing order.
+ */
+
+typedef struct {
+ uint32_t p;
+ uint32_t g;
+ uint32_t s;
+} small_prime;
+
+static const small_prime PRIMES[] = {
+ { 2147473409, 383167813, 10239 },
+ { 2147389441, 211808905, 471403745 },
+ { 2147387393, 37672282, 1329335065 },
+ { 2147377153, 1977035326, 968223422 },
+ { 2147358721, 1067163706, 132460015 },
+ { 2147352577, 1606082042, 598693809 },
+ { 2147346433, 2033915641, 1056257184 },
+ { 2147338241, 1653770625, 421286710 },
+ { 2147309569, 631200819, 1111201074 },
+ { 2147297281, 2038364663, 1042003613 },
+ { 2147295233, 1962540515, 19440033 },
+ { 2147239937, 2100082663, 353296760 },
+ { 2147235841, 1991153006, 1703918027 },
+ { 2147217409, 516405114, 1258919613 },
+ { 2147205121, 409347988, 1089726929 },
+ { 2147196929, 927788991, 1946238668 },
+ { 2147178497, 1136922411, 1347028164 },
+ { 2147100673, 868626236, 701164723 },
+ { 2147082241, 1897279176, 617820870 },
+ { 2147074049, 1888819123, 158382189 },
+ { 2147051521, 25006327, 522758543 },
+ { 2147043329, 327546255, 37227845 },
+ { 2147039233, 766324424, 1133356428 },
+ { 2146988033, 1862817362, 73861329 },
+ { 2146963457, 404622040, 653019435 },
+ { 2146959361, 1936581214, 995143093 },
+ { 2146938881, 1559770096, 634921513 },
+ { 2146908161, 422623708, 1985060172 },
+ { 2146885633, 1751189170, 298238186 },
+ { 2146871297, 578919515, 291810829 },
+ { 2146846721, 1114060353, 915902322 },
+ { 2146834433, 2069565474, 47859524 },
+ { 2146818049, 1552824584, 646281055 },
+ { 2146775041, 1906267847, 1597832891 },
+ { 2146756609, 1847414714, 1228090888 },
+ { 2146744321, 1818792070, 1176377637 },
+ { 2146738177, 1118066398, 1054971214 },
+ { 2146736129, 52057278, 933422153 },
+ { 2146713601, 592259376, 1406621510 },
+ { 2146695169, 263161877, 1514178701 },
+ { 2146656257, 685363115, 384505091 },
+ { 2146650113, 927727032, 537575289 },
+ { 2146646017, 52575506, 1799464037 },
+ { 2146643969, 1276803876, 1348954416 },
+ { 2146603009, 814028633, 1521547704 },
+ { 2146572289, 1846678872, 1310832121 },
+ { 2146547713, 919368090, 1019041349 },
+ { 2146508801, 671847612, 38582496 },
+ { 2146492417, 283911680, 532424562 },
+ { 2146490369, 1780044827, 896447978 },
+ { 2146459649, 327980850, 1327906900 },
+ { 2146447361, 1310561493, 958645253 },
+ { 2146441217, 412148926, 287271128 },
+ { 2146437121, 293186449, 2009822534 },
+ { 2146430977, 179034356, 1359155584 },
+ { 2146418689, 1517345488, 1790248672 },
+ { 2146406401, 1615820390, 1584833571 },
+ { 2146404353, 826651445, 607120498 },
+ { 2146379777, 3816988, 1897049071 },
+ { 2146363393, 1221409784, 1986921567 },
+ { 2146355201, 1388081168, 849968120 },
+ { 2146336769, 1803473237, 1655544036 },
+ { 2146312193, 1023484977, 273671831 },
+ { 2146293761, 1074591448, 467406983 },
+ { 2146283521, 831604668, 1523950494 },
+ { 2146203649, 712865423, 1170834574 },
+ { 2146154497, 1764991362, 1064856763 },
+ { 2146142209, 627386213, 1406840151 },
+ { 2146127873, 1638674429, 2088393537 },
+ { 2146099201, 1516001018, 690673370 },
+ { 2146093057, 1294931393, 315136610 },
+ { 2146091009, 1942399533, 973539425 },
+ { 2146078721, 1843461814, 2132275436 },
+ { 2146060289, 1098740778, 360423481 },
+ { 2146048001, 1617213232, 1951981294 },
+ { 2146041857, 1805783169, 2075683489 },
+ { 2146019329, 272027909, 1753219918 },
+ { 2145986561, 1206530344, 2034028118 },
+ { 2145976321, 1243769360, 1173377644 },
+ { 2145964033, 887200839, 1281344586 },
+ { 2145906689, 1651026455, 906178216 },
+ { 2145875969, 1673238256, 1043521212 },
+ { 2145871873, 1226591210, 1399796492 },
+ { 2145841153, 1465353397, 1324527802 },
+ { 2145832961, 1150638905, 554084759 },
+ { 2145816577, 221601706, 427340863 },
+ { 2145785857, 608896761, 316590738 },
+ { 2145755137, 1712054942, 1684294304 },
+ { 2145742849, 1302302867, 724873116 },
+ { 2145728513, 516717693, 431671476 },
+ { 2145699841, 524575579, 1619722537 },
+ { 2145691649, 1925625239, 982974435 },
+ { 2145687553, 463795662, 1293154300 },
+ { 2145673217, 771716636, 881778029 },
+ { 2145630209, 1509556977, 837364988 },
+ { 2145595393, 229091856, 851648427 },
+ { 2145587201, 1796903241, 635342424 },
+ { 2145525761, 715310882, 1677228081 },
+ { 2145495041, 1040930522, 200685896 },
+ { 2145466369, 949804237, 1809146322 },
+ { 2145445889, 1673903706, 95316881 },
+ { 2145390593, 806941852, 1428671135 },
+ { 2145372161, 1402525292, 159350694 },
+ { 2145361921, 2124760298, 1589134749 },
+ { 2145359873, 1217503067, 1561543010 },
+ { 2145355777, 338341402, 83865711 },
+ { 2145343489, 1381532164, 641430002 },
+ { 2145325057, 1883895478, 1528469895 },
+ { 2145318913, 1335370424, 65809740 },
+ { 2145312769, 2000008042, 1919775760 },
+ { 2145300481, 961450962, 1229540578 },
+ { 2145282049, 910466767, 1964062701 },
+ { 2145232897, 816527501, 450152063 },
+ { 2145218561, 1435128058, 1794509700 },
+ { 2145187841, 33505311, 1272467582 },
+ { 2145181697, 269767433, 1380363849 },
+ { 2145175553, 56386299, 1316870546 },
+ { 2145079297, 2106880293, 1391797340 },
+ { 2145021953, 1347906152, 720510798 },
+ { 2145015809, 206769262, 1651459955 },
+ { 2145003521, 1885513236, 1393381284 },
+ { 2144960513, 1810381315, 31937275 },
+ { 2144944129, 1306487838, 2019419520 },
+ { 2144935937, 37304730, 1841489054 },
+ { 2144894977, 1601434616, 157985831 },
+ { 2144888833, 98749330, 2128592228 },
+ { 2144880641, 1772327002, 2076128344 },
+ { 2144864257, 1404514762, 2029969964 },
+ { 2144827393, 801236594, 406627220 },
+ { 2144806913, 349217443, 1501080290 },
+ { 2144796673, 1542656776, 2084736519 },
+ { 2144778241, 1210734884, 1746416203 },
+ { 2144759809, 1146598851, 716464489 },
+ { 2144757761, 286328400, 1823728177 },
+ { 2144729089, 1347555695, 1836644881 },
+ { 2144727041, 1795703790, 520296412 },
+ { 2144696321, 1302475157, 852964281 },
+ { 2144667649, 1075877614, 504992927 },
+ { 2144573441, 198765808, 1617144982 },
+ { 2144555009, 321528767, 155821259 },
+ { 2144550913, 814139516, 1819937644 },
+ { 2144536577, 571143206, 962942255 },
+ { 2144524289, 1746733766, 2471321 },
+ { 2144512001, 1821415077, 124190939 },
+ { 2144468993, 917871546, 1260072806 },
+ { 2144458753, 378417981, 1569240563 },
+ { 2144421889, 175229668, 1825620763 },
+ { 2144409601, 1699216963, 351648117 },
+ { 2144370689, 1071885991, 958186029 },
+ { 2144348161, 1763151227, 540353574 },
+ { 2144335873, 1060214804, 919598847 },
+ { 2144329729, 663515846, 1448552668 },
+ { 2144327681, 1057776305, 590222840 },
+ { 2144309249, 1705149168, 1459294624 },
+ { 2144296961, 325823721, 1649016934 },
+ { 2144290817, 738775789, 447427206 },
+ { 2144243713, 962347618, 893050215 },
+ { 2144237569, 1655257077, 900860862 },
+ { 2144161793, 242206694, 1567868672 },
+ { 2144155649, 769415308, 1247993134 },
+ { 2144137217, 320492023, 515841070 },
+ { 2144120833, 1639388522, 770877302 },
+ { 2144071681, 1761785233, 964296120 },
+ { 2144065537, 419817825, 204564472 },
+ { 2144028673, 666050597, 2091019760 },
+ { 2144010241, 1413657615, 1518702610 },
+ { 2143952897, 1238327946, 475672271 },
+ { 2143940609, 307063413, 1176750846 },
+ { 2143918081, 2062905559, 786785803 },
+ { 2143899649, 1338112849, 1562292083 },
+ { 2143891457, 68149545, 87166451 },
+ { 2143885313, 921750778, 394460854 },
+ { 2143854593, 719766593, 133877196 },
+ { 2143836161, 1149399850, 1861591875 },
+ { 2143762433, 1848739366, 1335934145 },
+ { 2143756289, 1326674710, 102999236 },
+ { 2143713281, 808061791, 1156900308 },
+ { 2143690753, 388399459, 1926468019 },
+ { 2143670273, 1427891374, 1756689401 },
+ { 2143666177, 1912173949, 986629565 },
+ { 2143645697, 2041160111, 371842865 },
+ { 2143641601, 1279906897, 2023974350 },
+ { 2143635457, 720473174, 1389027526 },
+ { 2143621121, 1298309455, 1732632006 },
+ { 2143598593, 1548762216, 1825417506 },
+ { 2143567873, 620475784, 1073787233 },
+ { 2143561729, 1932954575, 949167309 },
+ { 2143553537, 354315656, 1652037534 },
+ { 2143541249, 577424288, 1097027618 },
+ { 2143531009, 357862822, 478640055 },
+ { 2143522817, 2017706025, 1550531668 },
+ { 2143506433, 2078127419, 1824320165 },
+ { 2143488001, 613475285, 1604011510 },
+ { 2143469569, 1466594987, 502095196 },
+ { 2143426561, 1115430331, 1044637111 },
+ { 2143383553, 9778045, 1902463734 },
+ { 2143377409, 1557401276, 2056861771 },
+ { 2143363073, 652036455, 1965915971 },
+ { 2143260673, 1464581171, 1523257541 },
+ { 2143246337, 1876119649, 764541916 },
+ { 2143209473, 1614992673, 1920672844 },
+ { 2143203329, 981052047, 2049774209 },
+ { 2143160321, 1847355533, 728535665 },
+ { 2143129601, 965558457, 603052992 },
+ { 2143123457, 2140817191, 8348679 },
+ { 2143100929, 1547263683, 694209023 },
+ { 2143092737, 643459066, 1979934533 },
+ { 2143082497, 188603778, 2026175670 },
+ { 2143062017, 1657329695, 377451099 },
+ { 2143051777, 114967950, 979255473 },
+ { 2143025153, 1698431342, 1449196896 },
+ { 2143006721, 1862741675, 1739650365 },
+ { 2142996481, 756660457, 996160050 },
+ { 2142976001, 927864010, 1166847574 },
+ { 2142965761, 905070557, 661974566 },
+ { 2142916609, 40932754, 1787161127 },
+ { 2142892033, 1987985648, 675335382 },
+ { 2142885889, 797497211, 1323096997 },
+ { 2142871553, 2068025830, 1411877159 },
+ { 2142861313, 1217177090, 1438410687 },
+ { 2142830593, 409906375, 1767860634 },
+ { 2142803969, 1197788993, 359782919 },
+ { 2142785537, 643817365, 513932862 },
+ { 2142779393, 1717046338, 218943121 },
+ { 2142724097, 89336830, 416687049 },
+ { 2142707713, 5944581, 1356813523 },
+ { 2142658561, 887942135, 2074011722 },
+ { 2142638081, 151851972, 1647339939 },
+ { 2142564353, 1691505537, 1483107336 },
+ { 2142533633, 1989920200, 1135938817 },
+ { 2142529537, 959263126, 1531961857 },
+ { 2142527489, 453251129, 1725566162 },
+ { 2142502913, 1536028102, 182053257 },
+ { 2142498817, 570138730, 701443447 },
+ { 2142416897, 326965800, 411931819 },
+ { 2142363649, 1675665410, 1517191733 },
+ { 2142351361, 968529566, 1575712703 },
+ { 2142330881, 1384953238, 1769087884 },
+ { 2142314497, 1977173242, 1833745524 },
+ { 2142289921, 95082313, 1714775493 },
+ { 2142283777, 109377615, 1070584533 },
+ { 2142277633, 16960510, 702157145 },
+ { 2142263297, 553850819, 431364395 },
+ { 2142208001, 241466367, 2053967982 },
+ { 2142164993, 1795661326, 1031836848 },
+ { 2142097409, 1212530046, 712772031 },
+ { 2142087169, 1763869720, 822276067 },
+ { 2142078977, 644065713, 1765268066 },
+ { 2142074881, 112671944, 643204925 },
+ { 2142044161, 1387785471, 1297890174 },
+ { 2142025729, 783885537, 1000425730 },
+ { 2142011393, 905662232, 1679401033 },
+ { 2141974529, 799788433, 468119557 },
+ { 2141943809, 1932544124, 449305555 },
+ { 2141933569, 1527403256, 841867925 },
+ { 2141931521, 1247076451, 743823916 },
+ { 2141902849, 1199660531, 401687910 },
+ { 2141890561, 150132350, 1720336972 },
+ { 2141857793, 1287438162, 663880489 },
+ { 2141833217, 618017731, 1819208266 },
+ { 2141820929, 999578638, 1403090096 },
+ { 2141786113, 81834325, 1523542501 },
+ { 2141771777, 120001928, 463556492 },
+ { 2141759489, 122455485, 2124928282 },
+ { 2141749249, 141986041, 940339153 },
+ { 2141685761, 889088734, 477141499 },
+ { 2141673473, 324212681, 1122558298 },
+ { 2141669377, 1175806187, 1373818177 },
+ { 2141655041, 1113654822, 296887082 },
+ { 2141587457, 991103258, 1585913875 },
+ { 2141583361, 1401451409, 1802457360 },
+ { 2141575169, 1571977166, 712760980 },
+ { 2141546497, 1107849376, 1250270109 },
+ { 2141515777, 196544219, 356001130 },
+ { 2141495297, 1733571506, 1060744866 },
+ { 2141483009, 321552363, 1168297026 },
+ { 2141458433, 505818251, 733225819 },
+ { 2141360129, 1026840098, 948342276 },
+ { 2141325313, 945133744, 2129965998 },
+ { 2141317121, 1871100260, 1843844634 },
+ { 2141286401, 1790639498, 1750465696 },
+ { 2141267969, 1376858592, 186160720 },
+ { 2141255681, 2129698296, 1876677959 },
+ { 2141243393, 2138900688, 1340009628 },
+ { 2141214721, 1933049835, 1087819477 },
+ { 2141212673, 1898664939, 1786328049 },
+ { 2141202433, 990234828, 940682169 },
+ { 2141175809, 1406392421, 993089586 },
+ { 2141165569, 1263518371, 289019479 },
+ { 2141073409, 1485624211, 507864514 },
+ { 2141052929, 1885134788, 311252465 },
+ { 2141040641, 1285021247, 280941862 },
+ { 2141028353, 1527610374, 375035110 },
+ { 2141011969, 1400626168, 164696620 },
+ { 2140999681, 632959608, 966175067 },
+ { 2140997633, 2045628978, 1290889438 },
+ { 2140993537, 1412755491, 375366253 },
+ { 2140942337, 719477232, 785367828 },
+ { 2140925953, 45224252, 836552317 },
+ { 2140917761, 1157376588, 1001839569 },
+ { 2140887041, 278480752, 2098732796 },
+ { 2140837889, 1663139953, 924094810 },
+ { 2140788737, 802501511, 2045368990 },
+ { 2140766209, 1820083885, 1800295504 },
+ { 2140764161, 1169561905, 2106792035 },
+ { 2140696577, 127781498, 1885987531 },
+ { 2140684289, 16014477, 1098116827 },
+ { 2140653569, 665960598, 1796728247 },
+ { 2140594177, 1043085491, 377310938 },
+ { 2140579841, 1732838211, 1504505945 },
+ { 2140569601, 302071939, 358291016 },
+ { 2140567553, 192393733, 1909137143 },
+ { 2140557313, 406595731, 1175330270 },
+ { 2140549121, 1748850918, 525007007 },
+ { 2140477441, 499436566, 1031159814 },
+ { 2140469249, 1886004401, 1029951320 },
+ { 2140426241, 1483168100, 1676273461 },
+ { 2140420097, 1779917297, 846024476 },
+ { 2140413953, 522948893, 1816354149 },
+ { 2140383233, 1931364473, 1296921241 },
+ { 2140366849, 1917356555, 147196204 },
+ { 2140354561, 16466177, 1349052107 },
+ { 2140348417, 1875366972, 1860485634 },
+ { 2140323841, 456498717, 1790256483 },
+ { 2140321793, 1629493973, 150031888 },
+ { 2140315649, 1904063898, 395510935 },
+ { 2140280833, 1784104328, 831417909 },
+ { 2140250113, 256087139, 697349101 },
+ { 2140229633, 388553070, 243875754 },
+ { 2140223489, 747459608, 1396270850 },
+ { 2140200961, 507423743, 1895572209 },
+ { 2140162049, 580106016, 2045297469 },
+ { 2140149761, 712426444, 785217995 },
+ { 2140137473, 1441607584, 536866543 },
+ { 2140119041, 346538902, 1740434653 },
+ { 2140090369, 282642885, 21051094 },
+ { 2140076033, 1407456228, 319910029 },
+ { 2140047361, 1619330500, 1488632070 },
+ { 2140041217, 2089408064, 2012026134 },
+ { 2140008449, 1705524800, 1613440760 },
+ { 2139924481, 1846208233, 1280649481 },
+ { 2139906049, 989438755, 1185646076 },
+ { 2139867137, 1522314850, 372783595 },
+ { 2139842561, 1681587377, 216848235 },
+ { 2139826177, 2066284988, 1784999464 },
+ { 2139824129, 480888214, 1513323027 },
+ { 2139789313, 847937200, 858192859 },
+ { 2139783169, 1642000434, 1583261448 },
+ { 2139770881, 940699589, 179702100 },
+ { 2139768833, 315623242, 964612676 },
+ { 2139666433, 331649203, 764666914 },
+ { 2139641857, 2118730799, 1313764644 },
+ { 2139635713, 519149027, 519212449 },
+ { 2139598849, 1526413634, 1769667104 },
+ { 2139574273, 551148610, 820739925 },
+ { 2139568129, 1386800242, 472447405 },
+ { 2139549697, 813760130, 1412328531 },
+ { 2139537409, 1615286260, 1609362979 },
+ { 2139475969, 1352559299, 1696720421 },
+ { 2139455489, 1048691649, 1584935400 },
+ { 2139432961, 836025845, 950121150 },
+ { 2139424769, 1558281165, 1635486858 },
+ { 2139406337, 1728402143, 1674423301 },
+ { 2139396097, 1727715782, 1483470544 },
+ { 2139383809, 1092853491, 1741699084 },
+ { 2139369473, 690776899, 1242798709 },
+ { 2139351041, 1768782380, 2120712049 },
+ { 2139334657, 1739968247, 1427249225 },
+ { 2139332609, 1547189119, 623011170 },
+ { 2139310081, 1346827917, 1605466350 },
+ { 2139303937, 369317948, 828392831 },
+ { 2139301889, 1560417239, 1788073219 },
+ { 2139283457, 1303121623, 595079358 },
+ { 2139248641, 1354555286, 573424177 },
+ { 2139240449, 60974056, 885781403 },
+ { 2139222017, 355573421, 1221054839 },
+ { 2139215873, 566477826, 1724006500 },
+ { 2139150337, 871437673, 1609133294 },
+ { 2139144193, 1478130914, 1137491905 },
+ { 2139117569, 1854880922, 964728507 },
+ { 2139076609, 202405335, 756508944 },
+ { 2139062273, 1399715741, 884826059 },
+ { 2139045889, 1051045798, 1202295476 },
+ { 2139033601, 1707715206, 632234634 },
+ { 2139006977, 2035853139, 231626690 },
+ { 2138951681, 183867876, 838350879 },
+ { 2138945537, 1403254661, 404460202 },
+ { 2138920961, 310865011, 1282911681 },
+ { 2138910721, 1328496553, 103472415 },
+ { 2138904577, 78831681, 993513549 },
+ { 2138902529, 1319697451, 1055904361 },
+ { 2138816513, 384338872, 1706202469 },
+ { 2138810369, 1084868275, 405677177 },
+ { 2138787841, 401181788, 1964773901 },
+ { 2138775553, 1850532988, 1247087473 },
+ { 2138767361, 874261901, 1576073565 },
+ { 2138757121, 1187474742, 993541415 },
+ { 2138748929, 1782458888, 1043206483 },
+ { 2138744833, 1221500487, 800141243 },
+ { 2138738689, 413465368, 1450660558 },
+ { 2138695681, 739045140, 342611472 },
+ { 2138658817, 1355845756, 672674190 },
+ { 2138644481, 608379162, 1538874380 },
+ { 2138632193, 1444914034, 686911254 },
+ { 2138607617, 484707818, 1435142134 },
+ { 2138591233, 539460669, 1290458549 },
+ { 2138572801, 2093538990, 2011138646 },
+ { 2138552321, 1149786988, 1076414907 },
+ { 2138546177, 840688206, 2108985273 },
+ { 2138533889, 209669619, 198172413 },
+ { 2138523649, 1975879426, 1277003968 },
+ { 2138490881, 1351891144, 1976858109 },
+ { 2138460161, 1817321013, 1979278293 },
+ { 2138429441, 1950077177, 203441928 },
+ { 2138400769, 908970113, 628395069 },
+ { 2138398721, 219890864, 758486760 },
+ { 2138376193, 1306654379, 977554090 },
+ { 2138351617, 298822498, 2004708503 },
+ { 2138337281, 441457816, 1049002108 },
+ { 2138320897, 1517731724, 1442269609 },
+ { 2138290177, 1355911197, 1647139103 },
+ { 2138234881, 531313247, 1746591962 },
+ { 2138214401, 1899410930, 781416444 },
+ { 2138202113, 1813477173, 1622508515 },
+ { 2138191873, 1086458299, 1025408615 },
+ { 2138183681, 1998800427, 827063290 },
+ { 2138173441, 1921308898, 749670117 },
+ { 2138103809, 1620902804, 2126787647 },
+ { 2138099713, 828647069, 1892961817 },
+ { 2138085377, 179405355, 1525506535 },
+ { 2138060801, 615683235, 1259580138 },
+ { 2138044417, 2030277840, 1731266562 },
+ { 2138042369, 2087222316, 1627902259 },
+ { 2138032129, 126388712, 1108640984 },
+ { 2138011649, 715026550, 1017980050 },
+ { 2137993217, 1693714349, 1351778704 },
+ { 2137888769, 1289762259, 1053090405 },
+ { 2137853953, 199991890, 1254192789 },
+ { 2137833473, 941421685, 896995556 },
+ { 2137817089, 750416446, 1251031181 },
+ { 2137792513, 798075119, 368077456 },
+ { 2137786369, 878543495, 1035375025 },
+ { 2137767937, 9351178, 1156563902 },
+ { 2137755649, 1382297614, 1686559583 },
+ { 2137724929, 1345472850, 1681096331 },
+ { 2137704449, 834666929, 630551727 },
+ { 2137673729, 1646165729, 1892091571 },
+ { 2137620481, 778943821, 48456461 },
+ { 2137618433, 1730837875, 1713336725 },
+ { 2137581569, 805610339, 1378891359 },
+ { 2137538561, 204342388, 1950165220 },
+ { 2137526273, 1947629754, 1500789441 },
+ { 2137516033, 719902645, 1499525372 },
+ { 2137491457, 230451261, 556382829 },
+ { 2137440257, 979573541, 412760291 },
+ { 2137374721, 927841248, 1954137185 },
+ { 2137362433, 1243778559, 861024672 },
+ { 2137313281, 1341338501, 980638386 },
+ { 2137311233, 937415182, 1793212117 },
+ { 2137255937, 795331324, 1410253405 },
+ { 2137243649, 150756339, 1966999887 },
+ { 2137182209, 163346914, 1939301431 },
+ { 2137171969, 1952552395, 758913141 },
+ { 2137159681, 570788721, 218668666 },
+ { 2137147393, 1896656810, 2045670345 },
+ { 2137141249, 358493842, 518199643 },
+ { 2137139201, 1505023029, 674695848 },
+ { 2137133057, 27911103, 830956306 },
+ { 2137122817, 439771337, 1555268614 },
+ { 2137116673, 790988579, 1871449599 },
+ { 2137110529, 432109234, 811805080 },
+ { 2137102337, 1357900653, 1184997641 },
+ { 2137098241, 515119035, 1715693095 },
+ { 2137090049, 408575203, 2085660657 },
+ { 2137085953, 2097793407, 1349626963 },
+ { 2137055233, 1556739954, 1449960883 },
+ { 2137030657, 1545758650, 1369303716 },
+ { 2136987649, 332602570, 103875114 },
+ { 2136969217, 1499989506, 1662964115 },
+ { 2136924161, 857040753, 4738842 },
+ { 2136895489, 1948872712, 570436091 },
+ { 2136893441, 58969960, 1568349634 },
+ { 2136887297, 2127193379, 273612548 },
+ { 2136850433, 111208983, 1181257116 },
+ { 2136809473, 1627275942, 1680317971 },
+ { 2136764417, 1574888217, 14011331 },
+ { 2136741889, 14011055, 1129154251 },
+ { 2136727553, 35862563, 1838555253 },
+ { 2136721409, 310235666, 1363928244 },
+ { 2136698881, 1612429202, 1560383828 },
+ { 2136649729, 1138540131, 800014364 },
+ { 2136606721, 602323503, 1433096652 },
+ { 2136563713, 182209265, 1919611038 },
+ { 2136555521, 324156477, 165591039 },
+ { 2136549377, 195513113, 217165345 },
+ { 2136526849, 1050768046, 939647887 },
+ { 2136508417, 1886286237, 1619926572 },
+ { 2136477697, 609647664, 35065157 },
+ { 2136471553, 679352216, 1452259468 },
+ { 2136457217, 128630031, 824816521 },
+ { 2136422401, 19787464, 1526049830 },
+ { 2136420353, 698316836, 1530623527 },
+ { 2136371201, 1651862373, 1804812805 },
+ { 2136334337, 326596005, 336977082 },
+ { 2136322049, 63253370, 1904972151 },
+ { 2136297473, 312176076, 172182411 },
+ { 2136248321, 381261841, 369032670 },
+ { 2136242177, 358688773, 1640007994 },
+ { 2136229889, 512677188, 75585225 },
+ { 2136219649, 2095003250, 1970086149 },
+ { 2136207361, 1909650722, 537760675 },
+ { 2136176641, 1334616195, 1533487619 },
+ { 2136158209, 2096285632, 1793285210 },
+ { 2136143873, 1897347517, 293843959 },
+ { 2136133633, 923586222, 1022655978 },
+ { 2136096769, 1464868191, 1515074410 },
+ { 2136094721, 2020679520, 2061636104 },
+ { 2136076289, 290798503, 1814726809 },
+ { 2136041473, 156415894, 1250757633 },
+ { 2135996417, 297459940, 1132158924 },
+ { 2135955457, 538755304, 1688831340 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Reduce a small signed integer modulo a small prime. The source
+ * value x MUST be such that -p < x < p.
+ */
+static inline uint32_t
+modp_set(int32_t x, uint32_t p) {
+ uint32_t w;
+
+ w = (uint32_t)x;
+ w += p & -(w >> 31);
+ return w;
+}
+
+/*
+ * Normalize a modular integer around 0.
+ */
+static inline int32_t
+modp_norm(uint32_t x, uint32_t p) {
+ return (int32_t)(x - (p & (((x - ((p + 1) >> 1)) >> 31) - 1)));
+}
+
+/*
+ * Compute -1/p mod 2^31. This works for all odd integers p that fit
+ * on 31 bits.
+ */
+static uint32_t
+modp_ninv31(uint32_t p) {
+ uint32_t y;
+
+ y = 2 - p;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ y *= 2 - p * y;
+ return (uint32_t)0x7FFFFFFF & -y;
+}
+
+/*
+ * Compute R = 2^31 mod p.
+ */
+static inline uint32_t
+modp_R(uint32_t p) {
+ /*
+ * Since 2^30 < p < 2^31, we know that 2^31 mod p is simply
+ * 2^31 - p.
+ */
+ return ((uint32_t)1 << 31) - p;
+}
+
+/*
+ * Addition modulo p.
+ */
+static inline uint32_t
+modp_add(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a + b - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo p.
+ */
+static inline uint32_t
+modp_sub(uint32_t a, uint32_t b, uint32_t p) {
+ uint32_t d;
+
+ d = a - b;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Halving modulo p.
+ */
+/* unused
+static inline uint32_t
+modp_half(uint32_t a, uint32_t p)
+{
+ a += p & -(a & 1);
+ return a >> 1;
+}
+*/
+
+/*
+ * Montgomery multiplication modulo p. The 'p0i' value is -1/p mod 2^31.
+ * It is required that p is an odd integer.
+ */
+static inline uint32_t
+modp_montymul(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i) {
+ uint64_t z, w;
+ uint32_t d;
+
+ z = (uint64_t)a * (uint64_t)b;
+ w = ((z * p0i) & (uint64_t)0x7FFFFFFF) * p;
+ d = (uint32_t)((z + w) >> 31) - p;
+ d += p & -(d >> 31);
+ return d;
+}
+
+/*
+ * Compute R2 = 2^62 mod p.
+ */
+static uint32_t
+modp_R2(uint32_t p, uint32_t p0i) {
+ uint32_t z;
+
+ /*
+ * Compute z = 2^31 mod p (this is the value 1 in Montgomery
+ * representation), then double it with an addition.
+ */
+ z = modp_R(p);
+ z = modp_add(z, z, p);
+
+ /*
+ * Square it five times to obtain 2^32 in Montgomery representation
+ * (i.e. 2^63 mod p).
+ */
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+ z = modp_montymul(z, z, p, p0i);
+
+ /*
+ * Halve the value mod p to get 2^62.
+ */
+ z = (z + (p & -(z & 1))) >> 1;
+ return z;
+}
+
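+/*
+ * Illustrative sketch (editor's addition, not part of the upstream
+ * Falcon sources): converting into and out of Montgomery
+ * representation with the helpers above. Multiplying x by
+ * R2 = 2^62 mod p yields x*R mod p (the Montgomery form of x);
+ * multiplying a Montgomery value by 1 removes the R factor again.
+ * example_monty_roundtrip is a hypothetical name.
+ */
+static inline uint32_t
+example_monty_roundtrip(uint32_t x, uint32_t p) {
+    uint32_t p0i, R2, xm;
+
+    p0i = modp_ninv31(p);
+    R2 = modp_R2(p, p0i);
+    xm = modp_montymul(x, R2, p, p0i);    /* xm = x*R mod p */
+    return modp_montymul(xm, 1, p, p0i);  /* back to x */
+}
+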
+/*
+ * Compute 2^(31*x) modulo p. This works for integers x up to 2^11.
+ * p must be prime such that 2^30 < p < 2^31; p0i must be equal to
+ * -1/p mod 2^31; R2 must be equal to 2^62 mod p.
+ */
+static inline uint32_t
+modp_Rx(unsigned x, uint32_t p, uint32_t p0i, uint32_t R2) {
+ int i;
+ uint32_t r, z;
+
+ /*
+ * 2^(31*x) = (2^31)*(2^(31*(x-1))); i.e. we want the Montgomery
+ * representation of (2^31)^e mod p, where e = x-1.
+ * R2 is 2^31 in Montgomery representation.
+ */
+ x --;
+ r = R2;
+ z = modp_R(p);
+ for (i = 0; (1U << i) <= x; i ++) {
+ if ((x & (1U << i)) != 0) {
+ z = modp_montymul(z, r, p, p0i);
+ }
+ r = modp_montymul(r, r, p, p0i);
+ }
+ return z;
+}
+
+/*
+ * Division modulo p. If the divisor (b) is 0, then 0 is returned.
+ * This function computes proper results only when p is prime.
+ * Parameters:
+ * a dividend
+ * b divisor
+ * p odd prime modulus
+ * p0i -1/p mod 2^31
+ * R 2^31 mod p
+ */
+static uint32_t
+modp_div(uint32_t a, uint32_t b, uint32_t p, uint32_t p0i, uint32_t R) {
+ uint32_t z, e;
+ int i;
+
+ e = p - 2;
+ z = R;
+ for (i = 30; i >= 0; i --) {
+ uint32_t z2;
+
+ z = modp_montymul(z, z, p, p0i);
+ z2 = modp_montymul(z, b, p, p0i);
+ z ^= (z ^ z2) & -(uint32_t)((e >> i) & 1);
+ }
+
+ /*
+ * The loop above just assumed that b was in Montgomery
+ * representation, i.e. really contained b*R; under that
+ * assumption, it returns 1/b in Montgomery representation,
+ * which is R/b. But we gave it b in normal representation,
+ * so the loop really returned R/(b/R) = R^2/b.
+ *
+ * We want a/b, so we need one Montgomery multiplication with a,
+ * which also removes one of the R factors, and another such
+ * multiplication to remove the second R factor.
+ */
+ z = modp_montymul(z, 1, p, p0i);
+ return modp_montymul(a, z, p, p0i);
+}
+
+/*
+ * Bit-reversal index table.
+ */
+static const uint16_t REV10[] = {
+ 0, 512, 256, 768, 128, 640, 384, 896, 64, 576, 320, 832,
+ 192, 704, 448, 960, 32, 544, 288, 800, 160, 672, 416, 928,
+ 96, 608, 352, 864, 224, 736, 480, 992, 16, 528, 272, 784,
+ 144, 656, 400, 912, 80, 592, 336, 848, 208, 720, 464, 976,
+ 48, 560, 304, 816, 176, 688, 432, 944, 112, 624, 368, 880,
+ 240, 752, 496, 1008, 8, 520, 264, 776, 136, 648, 392, 904,
+ 72, 584, 328, 840, 200, 712, 456, 968, 40, 552, 296, 808,
+ 168, 680, 424, 936, 104, 616, 360, 872, 232, 744, 488, 1000,
+ 24, 536, 280, 792, 152, 664, 408, 920, 88, 600, 344, 856,
+ 216, 728, 472, 984, 56, 568, 312, 824, 184, 696, 440, 952,
+ 120, 632, 376, 888, 248, 760, 504, 1016, 4, 516, 260, 772,
+ 132, 644, 388, 900, 68, 580, 324, 836, 196, 708, 452, 964,
+ 36, 548, 292, 804, 164, 676, 420, 932, 100, 612, 356, 868,
+ 228, 740, 484, 996, 20, 532, 276, 788, 148, 660, 404, 916,
+ 84, 596, 340, 852, 212, 724, 468, 980, 52, 564, 308, 820,
+ 180, 692, 436, 948, 116, 628, 372, 884, 244, 756, 500, 1012,
+ 12, 524, 268, 780, 140, 652, 396, 908, 76, 588, 332, 844,
+ 204, 716, 460, 972, 44, 556, 300, 812, 172, 684, 428, 940,
+ 108, 620, 364, 876, 236, 748, 492, 1004, 28, 540, 284, 796,
+ 156, 668, 412, 924, 92, 604, 348, 860, 220, 732, 476, 988,
+ 60, 572, 316, 828, 188, 700, 444, 956, 124, 636, 380, 892,
+ 252, 764, 508, 1020, 2, 514, 258, 770, 130, 642, 386, 898,
+ 66, 578, 322, 834, 194, 706, 450, 962, 34, 546, 290, 802,
+ 162, 674, 418, 930, 98, 610, 354, 866, 226, 738, 482, 994,
+ 18, 530, 274, 786, 146, 658, 402, 914, 82, 594, 338, 850,
+ 210, 722, 466, 978, 50, 562, 306, 818, 178, 690, 434, 946,
+ 114, 626, 370, 882, 242, 754, 498, 1010, 10, 522, 266, 778,
+ 138, 650, 394, 906, 74, 586, 330, 842, 202, 714, 458, 970,
+ 42, 554, 298, 810, 170, 682, 426, 938, 106, 618, 362, 874,
+ 234, 746, 490, 1002, 26, 538, 282, 794, 154, 666, 410, 922,
+ 90, 602, 346, 858, 218, 730, 474, 986, 58, 570, 314, 826,
+ 186, 698, 442, 954, 122, 634, 378, 890, 250, 762, 506, 1018,
+ 6, 518, 262, 774, 134, 646, 390, 902, 70, 582, 326, 838,
+ 198, 710, 454, 966, 38, 550, 294, 806, 166, 678, 422, 934,
+ 102, 614, 358, 870, 230, 742, 486, 998, 22, 534, 278, 790,
+ 150, 662, 406, 918, 86, 598, 342, 854, 214, 726, 470, 982,
+ 54, 566, 310, 822, 182, 694, 438, 950, 118, 630, 374, 886,
+ 246, 758, 502, 1014, 14, 526, 270, 782, 142, 654, 398, 910,
+ 78, 590, 334, 846, 206, 718, 462, 974, 46, 558, 302, 814,
+ 174, 686, 430, 942, 110, 622, 366, 878, 238, 750, 494, 1006,
+ 30, 542, 286, 798, 158, 670, 414, 926, 94, 606, 350, 862,
+ 222, 734, 478, 990, 62, 574, 318, 830, 190, 702, 446, 958,
+ 126, 638, 382, 894, 254, 766, 510, 1022, 1, 513, 257, 769,
+ 129, 641, 385, 897, 65, 577, 321, 833, 193, 705, 449, 961,
+ 33, 545, 289, 801, 161, 673, 417, 929, 97, 609, 353, 865,
+ 225, 737, 481, 993, 17, 529, 273, 785, 145, 657, 401, 913,
+ 81, 593, 337, 849, 209, 721, 465, 977, 49, 561, 305, 817,
+ 177, 689, 433, 945, 113, 625, 369, 881, 241, 753, 497, 1009,
+ 9, 521, 265, 777, 137, 649, 393, 905, 73, 585, 329, 841,
+ 201, 713, 457, 969, 41, 553, 297, 809, 169, 681, 425, 937,
+ 105, 617, 361, 873, 233, 745, 489, 1001, 25, 537, 281, 793,
+ 153, 665, 409, 921, 89, 601, 345, 857, 217, 729, 473, 985,
+ 57, 569, 313, 825, 185, 697, 441, 953, 121, 633, 377, 889,
+ 249, 761, 505, 1017, 5, 517, 261, 773, 133, 645, 389, 901,
+ 69, 581, 325, 837, 197, 709, 453, 965, 37, 549, 293, 805,
+ 165, 677, 421, 933, 101, 613, 357, 869, 229, 741, 485, 997,
+ 21, 533, 277, 789, 149, 661, 405, 917, 85, 597, 341, 853,
+ 213, 725, 469, 981, 53, 565, 309, 821, 181, 693, 437, 949,
+ 117, 629, 373, 885, 245, 757, 501, 1013, 13, 525, 269, 781,
+ 141, 653, 397, 909, 77, 589, 333, 845, 205, 717, 461, 973,
+ 45, 557, 301, 813, 173, 685, 429, 941, 109, 621, 365, 877,
+ 237, 749, 493, 1005, 29, 541, 285, 797, 157, 669, 413, 925,
+ 93, 605, 349, 861, 221, 733, 477, 989, 61, 573, 317, 829,
+ 189, 701, 445, 957, 125, 637, 381, 893, 253, 765, 509, 1021,
+ 3, 515, 259, 771, 131, 643, 387, 899, 67, 579, 323, 835,
+ 195, 707, 451, 963, 35, 547, 291, 803, 163, 675, 419, 931,
+ 99, 611, 355, 867, 227, 739, 483, 995, 19, 531, 275, 787,
+ 147, 659, 403, 915, 83, 595, 339, 851, 211, 723, 467, 979,
+ 51, 563, 307, 819, 179, 691, 435, 947, 115, 627, 371, 883,
+ 243, 755, 499, 1011, 11, 523, 267, 779, 139, 651, 395, 907,
+ 75, 587, 331, 843, 203, 715, 459, 971, 43, 555, 299, 811,
+ 171, 683, 427, 939, 107, 619, 363, 875, 235, 747, 491, 1003,
+ 27, 539, 283, 795, 155, 667, 411, 923, 91, 603, 347, 859,
+ 219, 731, 475, 987, 59, 571, 315, 827, 187, 699, 443, 955,
+ 123, 635, 379, 891, 251, 763, 507, 1019, 7, 519, 263, 775,
+ 135, 647, 391, 903, 71, 583, 327, 839, 199, 711, 455, 967,
+ 39, 551, 295, 807, 167, 679, 423, 935, 103, 615, 359, 871,
+ 231, 743, 487, 999, 23, 535, 279, 791, 151, 663, 407, 919,
+ 87, 599, 343, 855, 215, 727, 471, 983, 55, 567, 311, 823,
+ 183, 695, 439, 951, 119, 631, 375, 887, 247, 759, 503, 1015,
+ 15, 527, 271, 783, 143, 655, 399, 911, 79, 591, 335, 847,
+ 207, 719, 463, 975, 47, 559, 303, 815, 175, 687, 431, 943,
+ 111, 623, 367, 879, 239, 751, 495, 1007, 31, 543, 287, 799,
+ 159, 671, 415, 927, 95, 607, 351, 863, 223, 735, 479, 991,
+ 63, 575, 319, 831, 191, 703, 447, 959, 127, 639, 383, 895,
+ 255, 767, 511, 1023
+};
+
+/*
+ * Compute the roots for NTT and inverse NTT (binary case). Input
+ * parameter g is a primitive 2048-th root of 1 modulo p (i.e. g^1024 =
+ * -1 mod p). This fills gm[] and igm[] with powers of g and 1/g:
+ * gm[rev(i)] = g^i mod p
+ * igm[rev(i)] = (1/g)^i mod p
+ * where rev() is the "bit reversal" function over 10 bits. It fills
+ * the arrays only up to N = 2^logn values.
+ *
+ * The values stored in gm[] and igm[] are in Montgomery representation.
+ *
+ * p must be a prime such that p = 1 mod 2048.
+ */
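+/*
+ * (Illustrative note, added for clarity; not part of the upstream
+ * comment.) REV10[] above is the 10-bit bit-reversal permutation:
+ * rev(1) = 0b1000000000 = 512 and rev(2) = 0b0100000000 = 256, so
+ * gm[512] holds g^1 and gm[256] holds g^2 (in Montgomery form). For
+ * logn < 10, the lookup REV10[u << (10 - logn)] below yields the
+ * logn-bit bit-reversal of u, since the low index bits are zero.
+ */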
+static void
+modp_mkgm2(uint32_t *gm, uint32_t *igm, unsigned logn,
+ uint32_t g, uint32_t p, uint32_t p0i) {
+ size_t u, n;
+ unsigned k;
+ uint32_t ig, x1, x2, R2;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * We want g such that g^(2N) = 1 mod p, but the provided
+ * generator has order 2048. We must square it a few times.
+ */
+ R2 = modp_R2(p, p0i);
+ g = modp_montymul(g, R2, p, p0i);
+ for (k = logn; k < 10; k ++) {
+ g = modp_montymul(g, g, p, p0i);
+ }
+
+ ig = modp_div(R2, g, p, p0i, modp_R(p));
+ k = 10 - logn;
+ x1 = x2 = modp_R(p);
+ for (u = 0; u < n; u ++) {
+ size_t v;
+
+ v = REV10[u << k];
+ gm[v] = x1;
+ igm[v] = x2;
+ x1 = modp_montymul(x1, g, p, p0i);
+ x2 = modp_montymul(x2, ig, p, p0i);
+ }
+}
+
+/*
+ * Compute the NTT over a polynomial (binary case). Polynomial elements
+ * are a[0], a[stride], a[2 * stride]...
+ */
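+/*
+ * (Illustrative note, added for clarity.) Each pass of the loop below
+ * combines pairs (x, y) into (x + s*y, x - s*y) mod p, where s is a
+ * power of the 2N-th root of unity read from gm[] in bit-reversed
+ * order; the inverse transform further down undoes these butterflies
+ * and finally multiplies every coefficient by 1/n.
+ */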
+static void
+modp_NTT2_ext(uint32_t *a, size_t stride, const uint32_t *gm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, u, v1;
+
+ ht = t >> 1;
+ for (u = 0, v1 = 0; u < m; u ++, v1 += t) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = gm[m + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + ht * stride;
+ for (v = 0; v < ht; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = modp_montymul(*r2, s, p, p0i);
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_sub(x, y, p);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT over a polynomial (binary case).
+ */
+static void
+modp_iNTT2_ext(uint32_t *a, size_t stride, const uint32_t *igm, unsigned logn,
+ uint32_t p, uint32_t p0i) {
+ size_t t, m, n, k;
+ uint32_t ni;
+ uint32_t *r;
+
+ if (logn == 0) {
+ return;
+ }
+ n = (size_t)1 << logn;
+ t = 1;
+ for (m = n; m > 1; m >>= 1) {
+ size_t hm, dt, u, v1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (u = 0, v1 = 0; u < hm; u ++, v1 += dt) {
+ uint32_t s;
+ size_t v;
+ uint32_t *r1, *r2;
+
+ s = igm[hm + u];
+ r1 = a + v1 * stride;
+ r2 = r1 + t * stride;
+ for (v = 0; v < t; v ++, r1 += stride, r2 += stride) {
+ uint32_t x, y;
+
+ x = *r1;
+ y = *r2;
+ *r1 = modp_add(x, y, p);
+ *r2 = modp_montymul(
+ modp_sub(x, y, p), s, p, p0i);
+ }
+ }
+ t = dt;
+ }
+
+ /*
+ * We need 1/n in Montgomery representation, i.e. R/n. Since
+ * 1 <= logn <= 10, R/n is an integer; moreover, R/n <= 2^30 < p,
+ * thus a simple shift will do.
+ */
+ ni = (uint32_t)1 << (31 - logn);
+ for (k = 0, r = a; k < n; k ++, r += stride) {
+ *r = modp_montymul(*r, ni, p, p0i);
+ }
+}
+
+/*
+ * Simplified macros for NTT and iNTT (binary case) when the elements
+ * are consecutive in RAM.
+ */
+#define modp_NTT2(a, gm, logn, p, p0i) modp_NTT2_ext(a, 1, gm, logn, p, p0i)
+#define modp_iNTT2(a, igm, logn, p, p0i) modp_iNTT2_ext(a, 1, igm, logn, p, p0i)
+
+/*
+ * Given polynomial f in NTT representation modulo p, compute f' of degree
+ * less than N/2 such that f' = f0^2 - X*f1^2, where f0 and f1 are
+ * polynomials of degree less than N/2 such that f = f0(X^2) + X*f1(X^2).
+ *
+ * The new polynomial is written "in place" over the first N/2 elements
+ * of f.
+ *
+ * If applied logn times successively on a given polynomial, the resulting
+ * degree-0 polynomial is the resultant of f and X^N+1 modulo p.
+ *
+ * This function applies only to the binary case; it is invoked from
+ * solve_NTRU_binary_depth1().
+ */
+static void
+modp_poly_rec_res(uint32_t *f, unsigned logn,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ size_t hn, u;
+
+ hn = (size_t)1 << (logn - 1);
+ for (u = 0; u < hn; u ++) {
+ uint32_t w0, w1;
+
+ w0 = f[(u << 1) + 0];
+ w1 = f[(u << 1) + 1];
+ f[u] = modp_montymul(modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+}
+
+/* ==================================================================== */
+/*
+ * Custom bignum implementation.
+ *
+ * This is a very reduced set of functionalities. We need to do the
+ * following operations:
+ *
+ * - Rebuild the resultant and the polynomial coefficients from their
+ * values modulo small primes (of length 31 bits each).
+ *
+ * - Compute an extended GCD between the two computed resultants.
+ *
+ * - Extract top bits and add scaled values during the successive steps
+ * of Babai rounding.
+ *
+ * When rebuilding values using CRT, we must also recompute the product
+ * of the small prime factors. We always do it one small factor at a
+ * time, so the "complicated" operations can be done modulo the small
+ * prime with the modp_* functions. CRT coefficients (inverses) are
+ * precomputed.
+ *
+ * All values are positive until the last step: when the polynomial
+ * coefficients have been rebuilt, we normalize them around 0. But then,
+ * only additions and subtractions on the upper few bits are needed
+ * afterwards.
+ *
+ * We keep big integers as arrays of 31-bit words (in uint32_t values);
+ * the top bit of each uint32_t is kept equal to 0. Using 31-bit words
+ * makes it easier to keep track of carries. When negative values are
+ * used, two's complement is used.
+ */
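+
+/*
+ * (Illustrative note, added for clarity.) With 31-bit limbs in
+ * little-endian order, the value 2^31 + 5 is stored as the two words
+ * { 5, 1 }, and -1 over two words (two's complement over 62 bits) is
+ * { 0x7FFFFFFF, 0x7FFFFFFF }.
+ */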
+
+/*
+ * Subtract integer b from integer a. Both integers are supposed to have
+ * the same size. The carry (0 or 1) is returned. Source arrays a and b
+ * MUST be distinct.
+ *
+ * The operation is performed as described above if ctl = 1. If
+ * ctl = 0, the value a[] is unmodified, but all memory accesses are
+ * still performed, and the carry is computed and returned.
+ */
+static uint32_t
+zint_sub(uint32_t *a, const uint32_t *b, size_t len,
+ uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ cc = 0;
+ m = -ctl;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, w;
+
+ aw = a[u];
+ w = aw - b[u] - cc;
+ cc = w >> 31;
+ aw ^= ((w & 0x7FFFFFFF) ^ aw) & m;
+ a[u] = aw;
+ }
+ return cc;
+}
+
+/*
+ * Multiply the provided big integer m with a small value x.
+ * This function assumes that x < 2^31. The carry word is returned.
+ */
+static uint32_t
+zint_mul_small(uint32_t *m, size_t mlen, uint32_t x) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < mlen; u ++) {
+ uint64_t z;
+
+ z = (uint64_t)m[u] * (uint64_t)x + cc;
+ m[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ return cc;
+}
+
+/*
+ * Reduce a big integer d modulo a small integer p.
+ * Rules:
+ * d is unsigned
+ * p is prime
+ * 2^30 < p < 2^31
+ * p0i = -(1/p) mod 2^31
+ * R2 = 2^62 mod p
+ */
+static uint32_t
+zint_mod_small_unsigned(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2) {
+ uint32_t x;
+ size_t u;
+
+ /*
+ * Algorithm: we inject words one by one, starting with the high
+ * word. Each step is:
+ * - multiply x by 2^31
+ * - add new word
+ */
+ x = 0;
+ u = dlen;
+ while (u -- > 0) {
+ uint32_t w;
+
+ x = modp_montymul(x, R2, p, p0i);
+ w = d[u] - p;
+ w += p & -(w >> 31);
+ x = modp_add(x, w, p);
+ }
+ return x;
+}
+
+/*
+ * Similar to zint_mod_small_unsigned(), except that d may be signed.
+ * Extra parameter is Rx = 2^(31*dlen) mod p.
+ */
+static uint32_t
+zint_mod_small_signed(const uint32_t *d, size_t dlen,
+ uint32_t p, uint32_t p0i, uint32_t R2, uint32_t Rx) {
+ uint32_t z;
+
+ if (dlen == 0) {
+ return 0;
+ }
+ z = zint_mod_small_unsigned(d, dlen, p, p0i, R2);
+ z = modp_sub(z, Rx & -(d[dlen - 1] >> 30), p);
+ return z;
+}
+
+/*
+ * Add y*s to x. x and y initially have length 'len' words; the new x
+ * has length 'len+1' words. 's' must fit on 31 bits. x[] and y[] must
+ * not overlap.
+ */
+static void
+zint_add_mul_small(uint32_t *x,
+ const uint32_t *y, size_t len, uint32_t s) {
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t xw, yw;
+ uint64_t z;
+
+ xw = x[u];
+ yw = y[u];
+ z = (uint64_t)yw * (uint64_t)s + (uint64_t)xw + (uint64_t)cc;
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+ cc = (uint32_t)(z >> 31);
+ }
+ x[len] = cc;
+}
+
+/*
+ * Normalize a modular integer around 0: if x > p/2, then x is replaced
+ * with x - p (signed encoding with two's complement); otherwise, x is
+ * untouched. The two integers x and p are encoded over the same length.
+ */
+static void
+zint_norm_zero(uint32_t *x, const uint32_t *p, size_t len) {
+ size_t u;
+ uint32_t r, bb;
+
+ /*
+ * Compare x with p/2. We use the shifted version of p, and p
+ * is odd, so we really compare with (p-1)/2; we want to perform
+ * the subtraction if and only if x > (p-1)/2.
+ */
+ r = 0;
+ bb = 0;
+ u = len;
+ while (u -- > 0) {
+ uint32_t wx, wp, cc;
+
+ /*
+ * Get the two words to compare in wx and wp (both over
+ * 31 bits exactly).
+ */
+ wx = x[u];
+ wp = (p[u] >> 1) | (bb << 30);
+ bb = p[u] & 1;
+
+ /*
+ * We set cc to -1, 0 or 1, depending on whether wp is
+ * lower than, equal to, or greater than wx.
+ */
+ cc = wp - wx;
+ cc = ((-cc) >> 31) | -(cc >> 31);
+
+ /*
+ * If r != 0 then it is either 1 or -1, and we keep its
+ * value. Otherwise, if r = 0, then we replace it with cc.
+ */
+ r |= cc & ((r & 1) - 1);
+ }
+
+ /*
+ * At this point, r = -1, 0 or 1, depending on whether (p-1)/2
+ * is lower than, equal to, or greater than x. We thus want to
+ * do the subtraction only if r = -1.
+ */
+ zint_sub(x, p, len, r >> 31);
+}
+
+/*
+ * Rebuild integers from their RNS representation. There are 'num'
+ * integers, and each consists of 'xlen' words. 'xx' points at the
+ * first word of the first integer; subsequent integers are accessed
+ * by adding 'xstride' repeatedly.
+ *
+ * The words of an integer are the RNS representation of that integer,
+ * using the provided 'primes' as moduli. This function replaces
+ * each integer with its multi-word value (little-endian order).
+ *
+ * If "normalize_signed" is non-zero, then the returned value is
+ * normalized to the -m/2..m/2 interval (where m is the product of all
+ * small prime moduli); two's complement is used for negative values.
+ */
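+/*
+ * (Illustrative note, added for clarity.) The loop below is an
+ * incremental CRT (Garner's method). Small example: from x = 2 mod 7
+ * and x = 3 mod 11, with q = 7, p = 11 and s = 1/q mod p = 8, the
+ * update computes 2 + 7*((8*(3-2)) mod 11) = 58, the unique solution
+ * modulo 77.
+ */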
+static void
+zint_rebuild_CRT(uint32_t *xx, size_t xlen, size_t xstride,
+ size_t num, const small_prime *primes, int normalize_signed,
+ uint32_t *tmp) {
+ size_t u;
+ uint32_t *x;
+
+ tmp[0] = primes[0].p;
+ for (u = 1; u < xlen; u ++) {
+ /*
+ * At the entry of each loop iteration:
+ * - the first u words of each array have been
+ * reassembled;
+ * - the first u words of tmp[] contains the
+ * product of the prime moduli processed so far.
+ *
+ * We call 'q' the product of all previous primes.
+ */
+ uint32_t p, p0i, s, R2;
+ size_t v;
+
+ p = primes[u].p;
+ s = primes[u].s;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ for (v = 0, x = xx; v < num; v ++, x += xstride) {
+ uint32_t xp, xq, xr;
+ /*
+ * xp = the integer x modulo the prime p for this
+ * iteration
+ * xq = (x mod q) mod p
+ */
+ xp = x[u];
+ xq = zint_mod_small_unsigned(x, u, p, p0i, R2);
+
+ /*
+ * New value is (x mod q) + q * (s * (xp - xq) mod p)
+ */
+ xr = modp_montymul(s, modp_sub(xp, xq, p), p, p0i);
+ zint_add_mul_small(x, tmp, u, xr);
+ }
+
+ /*
+ * Update product of primes in tmp[].
+ */
+ tmp[u] = zint_mul_small(tmp, u, p);
+ }
+
+ /*
+ * Normalize the reconstructed values around 0.
+ */
+ if (normalize_signed) {
+ for (u = 0, x = xx; u < num; u ++, x += xstride) {
+ zint_norm_zero(x, tmp, xlen);
+ }
+ }
+}
+
+/*
+ * Negate a big integer conditionally: value a is replaced with -a if
+ * and only if ctl = 1. Control value ctl must be 0 or 1.
+ */
+static void
+zint_negate(uint32_t *a, size_t len, uint32_t ctl) {
+ size_t u;
+ uint32_t cc, m;
+
+ /*
+ * If ctl = 1 then we flip the bits of a by XORing with
+ * 0x7FFFFFFF, and we add 1 to the value. If ctl = 0 then we XOR
+ * with 0 and add 0, which leaves the value unchanged.
+ */
+ cc = ctl;
+ m = -ctl >> 1;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw;
+
+ aw = a[u];
+ aw = (aw ^ m) + cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) and b with (a*ya+b*yb)/(2^31).
+ * The low bits are dropped (the caller should compute the coefficients
+ * such that these dropped bits are all zeros). If either or both
+ * yields a negative value, then the value is negated.
+ *
+ * Returned value is:
+ * 0 both values were positive
+ * 1 new a had to be negated
+ * 2 new b had to be negated
+ * 3 both new a and new b had to be negated
+ *
+ * Coefficients xa, xb, ya and yb may use the full signed 32-bit range.
+ */
+static uint32_t
+zint_co_reduce(uint32_t *a, uint32_t *b, size_t len,
+ int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t nega, negb;
+
+ cca = 0;
+ ccb = 0;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ nega = (uint32_t)((uint64_t)cca >> 63);
+ negb = (uint32_t)((uint64_t)ccb >> 63);
+ zint_negate(a, len, nega);
+ zint_negate(b, len, negb);
+ return nega | (negb << 1);
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ * if neg = 1, then -m <= a < 0
+ * if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] is allowed to use 32 bits.
+ *
+ * Modulus m must be odd.
+ */
+static void
+zint_finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) {
+ size_t u;
+ uint32_t cc, xm, ym;
+
+ /*
+ * First pass: compare a (assumed nonnegative) with m. Note that
+ * if the top word uses 32 bits, subtracting m must yield a
+ * value less than 2^31 since a < 2*m.
+ */
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ cc = (a[u] - m[u] - cc) >> 31;
+ }
+
+ /*
+ * If neg = 1 then we must add m (regardless of cc)
+ * If neg = 0 and cc = 0 then we must subtract m
+ * If neg = 0 and cc = 1 then we must do nothing
+ *
+ * In the loop below, we conditionally subtract either m or -m
+ * from a. Word xm is a word of m (if neg = 0) or -m (if neg = 1);
+ * but if neg = 0 and cc = 1, then ym = 0 and it forces mw to 0.
+ */
+ xm = -neg >> 1;
+ ym = -(neg | (1 - cc));
+ cc = neg;
+ for (u = 0; u < len; u ++) {
+ uint32_t aw, mw;
+
+ aw = a[u];
+ mw = (m[u] ^ xm) & ym;
+ aw = aw - mw - cc;
+ a[u] = aw & 0x7FFFFFFF;
+ cc = aw >> 31;
+ }
+}
+
+/*
+ * Replace a with (a*xa+b*xb)/(2^31) mod m, and b with
+ * (a*ya+b*yb)/(2^31) mod m. Modulus m must be odd; m0i = -1/m[0] mod 2^31.
+ */
+static void
+zint_co_reduce_mod(uint32_t *a, uint32_t *b, const uint32_t *m, size_t len,
+ uint32_t m0i, int64_t xa, int64_t xb, int64_t ya, int64_t yb) {
+ size_t u;
+ int64_t cca, ccb;
+ uint32_t fa, fb;
+
+ /*
+ * These are actually four combined Montgomery multiplications.
+ */
+ cca = 0;
+ ccb = 0;
+ fa = ((a[0] * (uint32_t)xa + b[0] * (uint32_t)xb) * m0i) & 0x7FFFFFFF;
+ fb = ((a[0] * (uint32_t)ya + b[0] * (uint32_t)yb) * m0i) & 0x7FFFFFFF;
+ for (u = 0; u < len; u ++) {
+ uint32_t wa, wb;
+ uint64_t za, zb;
+
+ wa = a[u];
+ wb = b[u];
+ za = wa * (uint64_t)xa + wb * (uint64_t)xb
+ + m[u] * (uint64_t)fa + (uint64_t)cca;
+ zb = wa * (uint64_t)ya + wb * (uint64_t)yb
+ + m[u] * (uint64_t)fb + (uint64_t)ccb;
+ if (u > 0) {
+ a[u - 1] = (uint32_t)za & 0x7FFFFFFF;
+ b[u - 1] = (uint32_t)zb & 0x7FFFFFFF;
+ }
+ cca = *(int64_t *)&za >> 31;
+ ccb = *(int64_t *)&zb >> 31;
+ }
+ a[len - 1] = (uint32_t)cca;
+ b[len - 1] = (uint32_t)ccb;
+
+ /*
+ * At this point:
+ * -m <= a < 2*m
+ * -m <= b < 2*m
+ * (this is a case of Montgomery reduction)
+ * The top words of 'a' and 'b' may have their 32nd bit set.
+ * We want to add or subtract the modulus, as required.
+ */
+ zint_finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+ zint_finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/*
+ * Compute a GCD between two positive big integers x and y. The two
+ * integers must be odd. Returned value is 1 if the GCD is 1, 0
+ * otherwise. When 1 is returned, arrays u and v are filled with values
+ * such that:
+ * 0 <= u <= y
+ * 0 <= v <= x
+ * x*u - y*v = 1
+ * x[] and y[] are unmodified. Both input values must have the same
+ * encoded length. Temporary array must be large enough to accommodate 4
+ * extra values of that length. Arrays u, v and tmp may not overlap with
+ * each other, or with either x or y.
+ */
+static int
+zint_bezout(uint32_t *u, uint32_t *v,
+ const uint32_t *x, const uint32_t *y,
+ size_t len, uint32_t *tmp) {
+ /*
+ * Algorithm is an extended binary GCD. We maintain 6 values
+ * a, b, u0, u1, v0 and v1 with the following invariants:
+ *
+ * a = x*u0 - y*v0
+ * b = x*u1 - y*v1
+ * 0 <= a <= x
+ * 0 <= b <= y
+ * 0 <= u0 < y
+ * 0 <= v0 < x
+ * 0 <= u1 <= y
+ * 0 <= v1 < x
+ *
+ * Initial values are:
+ *
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ *
+ * Each iteration reduces either a or b, and maintains the
+ * invariants. Algorithm stops when a = b, at which point their
+ * common value is GCD(a,b) and (u0,v0) (or (u1,v1)) contains
+ * the values (u,v) we want to return.
+ *
+ * The formal definition of the algorithm is a sequence of steps:
+ *
+ * - If a is even, then:
+ * a <- a/2
+ * u0 <- u0/2 mod y
+ * v0 <- v0/2 mod x
+ *
+ * - Otherwise, if b is even, then:
+ * b <- b/2
+ * u1 <- u1/2 mod y
+ * v1 <- v1/2 mod x
+ *
+ * - Otherwise, if a > b, then:
+ * a <- (a-b)/2
+ * u0 <- (u0-u1)/2 mod y
+ * v0 <- (v0-v1)/2 mod x
+ *
+ * - Otherwise:
+ * b <- (b-a)/2
+ * u1 <- (u1-u0)/2 mod y
+ * v1 <- (v1-v0)/2 mod x
+ *
+ * We can show that the operations above preserve the invariants:
+ *
+ * - If a is even, then u0 and v0 are either both even or both
+ * odd (since a = x*u0 - y*v0, and x and y are both odd).
+ * If u0 and v0 are both even, then (u0,v0) <- (u0/2,v0/2).
+ * Otherwise, (u0,v0) <- ((u0+y)/2,(v0+x)/2). Either way,
+ * the a = x*u0 - y*v0 invariant is preserved.
+ *
+ * - The same holds for the case where b is even.
+ *
+ * - If a and b are odd, and a > b, then:
+ *
+ * a-b = x*(u0-u1) - y*(v0-v1)
+ *
+ * In that situation, if u0 < u1, then x*(u0-u1) < 0, but
+ * a-b > 0; therefore, it must be that v0 < v1, and the
+ * first part of the update is: (u0,v0) <- (u0-u1+y,v0-v1+x),
+ * which preserves the invariants. Otherwise, if u0 > u1,
+ * then u0-u1 >= 1, thus x*(u0-u1) >= x. But a <= x and
+ * b >= 0, hence a-b <= x. It follows that, in that case,
+ * v0-v1 >= 0. The first part of the update is then:
+ * (u0,v0) <- (u0-u1,v0-v1), which again preserves the
+ * invariants.
+ *
+ * Either way, once the subtraction is done, the new value of
+ * a, which is the difference of two odd values, is even,
+ * and the remaining of this step is a subcase of the
+ * first algorithm case (i.e. when a is even).
+ *
+ * - If a and b are odd, and b > a, then a similar
+ * argument holds.
+ *
+ * The values a and b start at x and y, respectively. Since x
+ * and y are odd, their GCD is odd, and it is easily seen that
+ * all steps conserve the GCD (GCD(a-b,b) = GCD(a, b);
+ * GCD(a/2,b) = GCD(a,b) if GCD(a,b) is odd). Moreover, either a
+ * or b is reduced by at least one bit at each iteration, so
+ * the algorithm necessarily converges on the case a = b, at
+ * which point the common value is the GCD.
+ *
+ * In the algorithm expressed above, when a = b, the fourth case
+ * applies, and sets b = 0. Since a contains the GCD of x and y,
+ * which are both odd, a must be odd, and subsequent iterations
+ * (if any) will simply divide b by 2 repeatedly, which has no
+ * consequence. Thus, the algorithm can run for more iterations
+ * than necessary; the final GCD will be in a, and the (u,v)
+ * coefficients will be (u0,v0).
+ *
+ *
+ * The presentation above is bit-by-bit. It can be sped up by
+ * noticing that all decisions are taken based on the low bits
+ * and high bits of a and b. We can extract the two top words
+ * and low word of each of a and b, and compute reduction
+ * parameters pa, pb, qa and qb such that the new values for
+ * a and b are:
+ * a' = (a*pa + b*pb) / (2^31)
+ * b' = (a*qa + b*qb) / (2^31)
+ * the two divisions being exact. The coefficients are obtained
+ * just from the extracted words, and may be slightly off, requiring
+ * an optional correction: if a' < 0, then we replace pa with -pa
+ * and pb with -pb. Each such step reduces the total length
+ * (sum of lengths of a and b) by at least 30 bits.
+ */
+ uint32_t *u0, *u1, *v0, *v1, *a, *b;
+ uint32_t x0i, y0i;
+ uint32_t num, rc;
+ size_t j;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ /*
+ * u0 and v0 are the u and v result buffers; the four other
+ * values (u1, v1, a and b) are taken from tmp[].
+ */
+ u0 = u;
+ v0 = v;
+ u1 = tmp;
+ v1 = u1 + len;
+ a = v1 + len;
+ b = a + len;
+
+ /*
+ * We'll need the Montgomery reduction coefficients.
+ */
+ x0i = modp_ninv31(x[0]);
+ y0i = modp_ninv31(y[0]);
+
+ /*
+ * Initialize a, b, u0, u1, v0 and v1.
+ * a = x u0 = 1 v0 = 0
+ * b = y u1 = y v1 = x-1
+ * Note that x is odd, so computing x-1 is easy.
+ */
+ memcpy(a, x, len * sizeof * x);
+ memcpy(b, y, len * sizeof * y);
+ u0[0] = 1;
+ memset(u0 + 1, 0, (len - 1) * sizeof * u0);
+ memset(v0, 0, len * sizeof * v0);
+ memcpy(u1, y, len * sizeof * u1);
+ memcpy(v1, x, len * sizeof * v1);
+ v1[0] --;
+
+ /*
+ * Each input operand may be as large as 31*len bits, and we
+ * reduce the total length by at least 30 bits at each iteration.
+ */
+ for (num = 62 * (uint32_t)len + 30; num >= 30; num -= 30) {
+ uint32_t c0, c1;
+ uint32_t a0, a1, b0, b1;
+ uint64_t a_hi, b_hi;
+ uint32_t a_lo, b_lo;
+ int64_t pa, pb, qa, qb;
+ int i;
+ uint32_t r;
+
+ /*
+ * Extract the top words of a and b. If j is the highest
+ * index >= 1 such that a[j] != 0 or b[j] != 0, then we
+ * want (a[j] << 31) + a[j-1] and (b[j] << 31) + b[j-1].
+ * If a and b are down to one word each, then we use
+ * a[0] and b[0].
+ */
+ c0 = (uint32_t) -1;
+ c1 = (uint32_t) -1;
+ a0 = 0;
+ a1 = 0;
+ b0 = 0;
+ b1 = 0;
+ j = len;
+ while (j -- > 0) {
+ uint32_t aw, bw;
+
+ aw = a[j];
+ bw = b[j];
+ a0 ^= (a0 ^ aw) & c0;
+ a1 ^= (a1 ^ aw) & c1;
+ b0 ^= (b0 ^ bw) & c0;
+ b1 ^= (b1 ^ bw) & c1;
+ c1 = c0;
+ c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+ }
+
+ /*
+ * If c1 = 0, then we grabbed two words for a and b.
+ * If c1 != 0 but c0 = 0, then we grabbed one word. It
+ * is not possible that c1 != 0 and c0 != 0, because that
+ * would mean that both integers are zero.
+ */
+ a1 |= a0 & c1;
+ a0 &= ~c1;
+ b1 |= b0 & c1;
+ b0 &= ~c1;
+ a_hi = ((uint64_t)a0 << 31) + a1;
+ b_hi = ((uint64_t)b0 << 31) + b1;
+ a_lo = a[0];
+ b_lo = b[0];
+
+ /*
+ * Compute reduction factors:
+ *
+ * a' = a*pa + b*pb
+ * b' = a*qa + b*qb
+ *
+ * such that a' and b' are both multiples of 2^31, but are
+ * only marginally larger than a and b.
+ */
+ pa = 1;
+ pb = 0;
+ qa = 0;
+ qb = 1;
+ for (i = 0; i < 31; i ++) {
+ /*
+ * At each iteration:
+ *
+ * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+ * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+ * a <- a/2 if: a is even
+ * b <- b/2 if: a is odd, b is even
+ *
+ * We multiply a_lo and b_lo by 2 at each
+ * iteration, so a division by 2 is simply
+ * achieved by not multiplying by 2.
+ */
+ uint32_t rt, oa, ob, cAB, cBA, cA;
+ uint64_t rz;
+
+ /*
+ * rt = 1 if a_hi > b_hi, 0 otherwise.
+ */
+ rz = b_hi - a_hi;
+ rt = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+ & (a_hi ^ rz))) >> 63);
+
+ /*
+ * cAB = 1 if b must be subtracted from a
+ * cBA = 1 if a must be subtracted from b
+ * cA = 1 if a must be divided by 2
+ *
+ * Rules:
+ *
+ * cAB and cBA cannot both be 1.
+ * If a is not divided by 2, b is.
+ */
+ oa = (a_lo >> i) & 1;
+ ob = (b_lo >> i) & 1;
+ cAB = oa & ob & rt;
+ cBA = oa & ob & ~rt;
+ cA = cAB | (oa ^ 1);
+
+ /*
+ * Conditional subtractions.
+ */
+ a_lo -= b_lo & -cAB;
+ a_hi -= b_hi & -(uint64_t)cAB;
+ pa -= qa & -(int64_t)cAB;
+ pb -= qb & -(int64_t)cAB;
+ b_lo -= a_lo & -cBA;
+ b_hi -= a_hi & -(uint64_t)cBA;
+ qa -= pa & -(int64_t)cBA;
+ qb -= pb & -(int64_t)cBA;
+
+ /*
+ * Shifting.
+ */
+ a_lo += a_lo & (cA - 1);
+ pa += pa & ((int64_t)cA - 1);
+ pb += pb & ((int64_t)cA - 1);
+ a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+ b_lo += b_lo & -cA;
+ qa += qa & -(int64_t)cA;
+ qb += qb & -(int64_t)cA;
+ b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+ }
+
+ /*
+ * Apply the computed parameters to our values. We
+ * may have to correct pa and pb depending on the
+ * returned value of zint_co_reduce() (when a and/or b
+ * had to be negated).
+ */
+ r = zint_co_reduce(a, b, len, pa, pb, qa, qb);
+ pa -= (pa + pa) & -(int64_t)(r & 1);
+ pb -= (pb + pb) & -(int64_t)(r & 1);
+ qa -= (qa + qa) & -(int64_t)(r >> 1);
+ qb -= (qb + qb) & -(int64_t)(r >> 1);
+ zint_co_reduce_mod(u0, u1, y, len, y0i, pa, pb, qa, qb);
+ zint_co_reduce_mod(v0, v1, x, len, x0i, pa, pb, qa, qb);
+ }
+
+ /*
+ * At that point, array a[] should contain the GCD, and the
+ * results (u,v) should already be set. We check that the GCD
+ * is indeed 1. We also check that the two operands x and y
+ * are odd.
+ */
+ rc = a[0] ^ 1;
+ for (j = 1; j < len; j ++) {
+ rc |= a[j];
+ }
+ return (int)((1 - ((rc | -rc) >> 31)) & x[0] & y[0]);
+}
+
+/*
+ * Add k*y*2^sc to x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
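+/*
+ * (Illustrative note, added for clarity: for sc = 100 the caller would
+ * pass sch = 3 and scl = 7, since 100 = 3*31 + 7.)
+ */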
+static void
+zint_add_scaled_mul_small(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, int32_t k,
+ uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ int32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t wy, wys, ccu;
+ uint64_t z;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ /*
+ * The expression below does not overflow.
+ */
+ z = (uint64_t)((int64_t)wys * (int64_t)k + (int64_t)x[u] + cc);
+ x[u] = (uint32_t)z & 0x7FFFFFFF;
+
+ /*
+ * Right-shifting the signed value z would yield
+ * implementation-defined results (arithmetic shift is
+ * not guaranteed). However, we can cast to unsigned,
+ * and get the next carry as an unsigned word. We can
+ * then convert it back to signed by using the guaranteed
+ * fact that 'int32_t' uses two's complement with no
+ * trap representation or padding bit, and with a layout
+ * compatible with that of 'uint32_t'.
+ */
+ ccu = (uint32_t)(z >> 31);
+ cc = *(int32_t *)&ccu;
+ }
+}
+
+/*
+ * Subtract y*2^sc from x. The result is assumed to fit in the array of
+ * size xlen (truncation is applied if necessary).
+ * Scale factor 'sc' is provided as sch and scl, such that:
+ * sch = sc / 31
+ * scl = sc % 31
+ * xlen MUST NOT be lower than ylen.
+ *
+ * x[] and y[] are both signed integers, using two's complement for
+ * negative values.
+ */
+static void
+zint_sub_scaled(uint32_t *x, size_t xlen,
+ const uint32_t *y, size_t ylen, uint32_t sch, uint32_t scl) {
+ size_t u;
+ uint32_t ysign, tw;
+ uint32_t cc;
+
+ if (ylen == 0) {
+ return;
+ }
+
+ ysign = -(y[ylen - 1] >> 30) >> 1;
+ tw = 0;
+ cc = 0;
+ for (u = sch; u < xlen; u ++) {
+ size_t v;
+ uint32_t w, wy, wys;
+
+ /*
+ * Get the next word of y (scaled).
+ */
+ v = u - sch;
+ if (v < ylen) {
+ wy = y[v];
+ } else {
+ wy = ysign;
+ }
+ wys = ((wy << scl) & 0x7FFFFFFF) | tw;
+ tw = wy >> (31 - scl);
+
+ w = x[u] - wys - cc;
+ x[u] = w & 0x7FFFFFFF;
+ cc = w >> 31;
+ }
+}
+
+/*
+ * Convert a one-word signed big integer into a signed value.
+ */
+static inline int32_t
+zint_one_to_plain(const uint32_t *x) {
+ uint32_t w;
+
+ w = x[0];
+ w |= (w & 0x40000000) << 1;
+ return *(int32_t *)&w;
+}
+
+/* ==================================================================== */
+
+/*
+ * Convert a polynomial to floating-point values.
+ *
+ * Each coefficient has length flen words, and starts fstride words after
+ * the previous.
+ *
+ * IEEE-754 binary64 values can represent values in a finite range,
+ * roughly 2^(-1023) to 2^(+1023); thus, if coefficients are too large,
+ * they should be "trimmed" by pointing not to the lowest word of each,
+ * but to a higher-order word.
+ */
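+/*
+ * (Illustrative note, added for clarity.) Callers dealing with values
+ * longer than 10 words pass 'f + true_len - 10' with flen = 10, so only
+ * the top 310 bits contribute; the dropped low words amount to a
+ * scaling by 2^(31*(true_len - 10)), which the callers track separately
+ * (see scale_fg and scale_FG in solve_NTRU_intermediate()).
+ */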
+static void
+poly_big_to_fp(fpr *d, const uint32_t *f, size_t flen, size_t fstride,
+ unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ if (flen == 0) {
+ for (u = 0; u < n; u ++) {
+ d[u] = fpr_zero;
+ }
+ return;
+ }
+ for (u = 0; u < n; u ++, f += fstride) {
+ size_t v;
+ uint32_t neg, cc, xm;
+ fpr x, fsc;
+
+ /*
+ * Get sign of the integer; if it is negative, then we
+ * will load its absolute value instead, and negate the
+ * result.
+ */
+ neg = -(f[flen - 1] >> 30);
+ xm = neg >> 1;
+ cc = neg & 1;
+ x = fpr_zero;
+ fsc = fpr_one;
+ for (v = 0; v < flen; v ++, fsc = fpr_mul(fsc, fpr_ptwo31)) {
+ uint32_t w;
+
+ w = (f[v] ^ xm) + cc;
+ cc = w >> 31;
+ w &= 0x7FFFFFFF;
+ w -= (w << 1) & neg;
+ x = fpr_add(x, fpr_mul(fpr_of(*(int32_t *)&w), fsc));
+ }
+ d[u] = x;
+ }
+}
+
+/*
+ * Convert a polynomial to small integers. Source values are supposed
+ * to be one-word integers, signed over 31 bits. Returned value is 0
+ * if any of the coefficients exceeds the provided limit (in absolute
+ * value), or 1 on success.
+ *
+ * This is not constant-time; this is not a problem here, because on
+ * any failure, the NTRU-solving process will be deemed to have failed
+ * and the (f,g) polynomials will be discarded.
+ */
+static int
+poly_big_to_small(int8_t *d, const uint32_t *s, int lim, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = zint_one_to_plain(s + u);
+ if (z < -lim || z > lim) {
+ return 0;
+ }
+ d[u] = (int8_t)z;
+ }
+ return 1;
+}
+
+/*
+ * Subtract k*f from F, where F, f and k are polynomials modulo X^N+1.
+ * Coefficients of polynomial k are small integers (signed values in the
+ * -2^31..2^31 range) scaled by 2^sc. Value sc is provided as sch = sc / 31
+ * and scl = sc % 31.
+ *
+ * This function implements the basic quadratic multiplication algorithm,
+ * which is efficient in space (no extra buffer needed) but slow at
+ * high degree.
+ */
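+/*
+ * (Illustrative note, added for clarity.) The product is negacyclic
+ * (multiplication modulo X^N+1): in the inner loop below, once the
+ * running output index u+v reaches n-1, the destination pointer wraps
+ * back to F[0] and the sign of the coefficient is flipped, which is
+ * exactly the X^N = -1 reduction.
+ */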
+static void
+poly_sub_scaled(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ int32_t kf;
+ size_t v;
+ uint32_t *x;
+ const uint32_t *y;
+
+ kf = -k[u];
+ x = F + u * Fstride;
+ y = f;
+ for (v = 0; v < n; v ++) {
+ zint_add_scaled_mul_small(
+ x, Flen, y, flen, kf, sch, scl);
+ if (u + v == n - 1) {
+ x = F;
+ kf = -kf;
+ } else {
+ x += Fstride;
+ }
+ y += fstride;
+ }
+ }
+}
+
+/*
+ * Subtract k*f from F. Coefficients of polynomial k are small integers
+ * (signed values in the -2^31..2^31 range) scaled by 2^sc. This function
+ * assumes that the degree is large, and integers relatively small.
+ * The value sc is provided as sch = sc / 31 and scl = sc % 31.
+ */
+static void
+poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
+ const uint32_t *f, size_t flen, size_t fstride,
+ const int32_t *k, uint32_t sch, uint32_t scl, unsigned logn,
+ uint32_t *tmp) {
+ uint32_t *gm, *igm, *fk, *t1, *x;
+ const uint32_t *y;
+ size_t n, u, tlen;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ tlen = flen + 1;
+ gm = tmp;
+ igm = gm + MKN(logn);
+ fk = igm + MKN(logn);
+ t1 = fk + n * tlen;
+
+ primes = PRIMES;
+
+ /*
+ * Compute k*f in fk[], in RNS notation.
+ */
+ for (u = 0; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)flen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0; v < n; v ++) {
+ t1[v] = modp_set(k[v], p);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, y = f, x = fk + u;
+ v < n; v ++, y += fstride, x += tlen) {
+ *x = zint_mod_small_signed(y, flen, p, p0i, R2, Rx);
+ }
+ modp_NTT2_ext(fk + u, tlen, gm, logn, p, p0i);
+ for (v = 0, x = fk + u; v < n; v ++, x += tlen) {
+ *x = modp_montymul(
+ modp_montymul(t1[v], *x, p, p0i), R2, p, p0i);
+ }
+ modp_iNTT2_ext(fk + u, tlen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild k*f.
+ */
+ zint_rebuild_CRT(fk, tlen, tlen, n, primes, 1, t1);
+
+ /*
+ * Subtract k*f, scaled, from F.
+ */
+ for (u = 0, x = F, y = fk; u < n; u ++, x += Fstride, y += tlen) {
+ zint_sub_scaled(x, Flen, y, tlen, sch, scl);
+ }
+}
+
+/* ==================================================================== */
+
+#define RNG_CONTEXT inner_shake256_context
+
+/*
+ * Get a random 8-byte integer from a SHAKE-based RNG. This function
+ * ensures consistent interpretation of the SHAKE output so that
+ * the same values will be obtained over different platforms, in case
+ * a known seed is used.
+ */
+static inline uint64_t
+get_rng_u64(inner_shake256_context *rng) {
+ /*
+ * We enforce little-endian representation.
+ */
+
+ uint8_t tmp[8];
+
+ inner_shake256_extract(rng, tmp, sizeof tmp);
+ return (uint64_t)tmp[0]
+ | ((uint64_t)tmp[1] << 8)
+ | ((uint64_t)tmp[2] << 16)
+ | ((uint64_t)tmp[3] << 24)
+ | ((uint64_t)tmp[4] << 32)
+ | ((uint64_t)tmp[5] << 40)
+ | ((uint64_t)tmp[6] << 48)
+ | ((uint64_t)tmp[7] << 56);
+}
+
+/*
+ * Table below incarnates a discrete Gaussian distribution:
+ * D(x) = exp(-(x^2)/(2*sigma^2))
+ * where sigma = 1.17*sqrt(q/(2*N)), q = 12289, and N = 1024.
+ * Element 0 of the table is P(x = 0).
+ * For k > 0, element k is P(x >= k+1 | x > 0).
+ * Probabilities are scaled up by 2^63.
+ */
+static const uint64_t gauss_1024_12289[] = {
+ 1283868770400643928u, 6416574995475331444u, 4078260278032692663u,
+ 2353523259288686585u, 1227179971273316331u, 575931623374121527u,
+ 242543240509105209u, 91437049221049666u, 30799446349977173u,
+ 9255276791179340u, 2478152334826140u, 590642893610164u,
+ 125206034929641u, 23590435911403u, 3948334035941u,
+ 586753615614u, 77391054539u, 9056793210u,
+ 940121950u, 86539696u, 7062824u,
+ 510971u, 32764u, 1862u,
+ 94u, 4u, 0u
+};
+
+/*
+ * Generate a random value with a Gaussian distribution centered on 0.
+ * The RNG must be ready for extraction (already flipped).
+ *
+ * Distribution has standard deviation 1.17*sqrt(q/(2*N)). The
+ * precomputed table is for N = 1024. Since the sum of two independent
+ * values of standard deviation sigma has standard deviation
+ * sigma*sqrt(2), then we can just generate more values and add them
+ * together for lower dimensions.
+ */
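+/*
+ * (Illustrative note, added for clarity.) For logn = 9 (N = 512), g = 2
+ * below, so two samples from the N = 1024 table are added; their sum
+ * has standard deviation sigma_1024*sqrt(2) = 1.17*sqrt(q/1024), which
+ * is the target 1.17*sqrt(q/(2*N)) for N = 512.
+ */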
+static int
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
+ unsigned u, g;
+ int val;
+
+ g = 1U << (10 - logn);
+ val = 0;
+ for (u = 0; u < g; u ++) {
+ /*
+ * Each iteration generates one value with the
+ * Gaussian distribution for N = 1024.
+ *
+ * We use two random 64-bit values. First value
+ * decides on whether the generated value is 0, and,
+ * if not, the sign of the value. Second random 64-bit
+ * word is used to generate the non-zero value.
+ *
+ * For constant-time code we have to read the complete
+ * table. This has negligible cost, compared with the
+ * remainder of the keygen process (solving the NTRU
+ * equation).
+ */
+ uint64_t r;
+ uint32_t f, v, k, neg;
+
+ /*
+ * First value:
+ * - flag 'neg' is randomly selected to be 0 or 1.
+ * - flag 'f' is set to 1 if the generated value is zero,
+ * or set to 0 otherwise.
+ */
+ r = get_rng_u64(rng);
+ neg = (uint32_t)(r >> 63);
+ r &= ~((uint64_t)1 << 63);
+ f = (uint32_t)((r - gauss_1024_12289[0]) >> 63);
+
+ /*
+ * We produce a new random 63-bit integer r, and go over
+ * the array, starting at index 1. We store in v the
+ * index of the first array element which is not greater
+ * than r, unless the flag f was already 1.
+ */
+ v = 0;
+ r = get_rng_u64(rng);
+ r &= ~((uint64_t)1 << 63);
+ for (k = 1; k < (sizeof gauss_1024_12289)
+ / (sizeof gauss_1024_12289[0]); k ++) {
+ uint32_t t;
+
+ t = (uint32_t)((r - gauss_1024_12289[k]) >> 63) ^ 1;
+ v |= k & -(t & (f ^ 1));
+ f |= t;
+ }
+
+ /*
+ * We apply the sign ('neg' flag). If the value is zero,
+ * the sign has no effect.
+ */
+ v = (v ^ -neg) + neg;
+
+ /*
+ * Generated value is added to val.
+ */
+ val += *(int32_t *)&v;
+ }
+ return val;
+}
+
+/*
+ * The MAX_BL_SMALL[] and MAX_BL_LARGE[] arrays contain the lengths, in 31-bit
+ * words, of intermediate values in the computation:
+ *
+ * MAX_BL_SMALL[depth]: length for the input f and g at that depth
+ * MAX_BL_LARGE[depth]: length for the unreduced F and G at that depth
+ *
+ * Rules:
+ *
+ * - Within an array, values grow.
+ *
+ * - The 'SMALL' array must have an entry for maximum depth, corresponding
+ * to the size of values used in the binary GCD. There is no such value
+ * for the 'LARGE' array (the binary GCD yields already reduced
+ * coefficients).
+ *
+ * - MAX_BL_LARGE[depth] >= MAX_BL_SMALL[depth + 1].
+ *
+ * - Values must be large enough to handle the common cases, with some
+ * margins.
+ *
+ * - Values must not be "too large" either because we will convert some
+ * integers into floating-point values by considering the top 10 words,
+ * i.e. 310 bits; hence, for values of length more than 10 words, we
+ * should take care to have the length centered on the expected size.
+ *
+ * The following average lengths, in bits, have been measured on thousands
+ * of random keys (fg = max length of the absolute value of coefficients
+ * of f and g at that depth; FG = idem for the unreduced F and G; for the
+ * maximum depth, F and G are the output of binary GCD, multiplied by q;
+ * for each value, the average and standard deviation are provided).
+ *
+ * Binary case:
+ * depth: 10 fg: 6307.52 (24.48) FG: 6319.66 (24.51)
+ * depth: 9 fg: 3138.35 (12.25) FG: 9403.29 (27.55)
+ * depth: 8 fg: 1576.87 ( 7.49) FG: 4703.30 (14.77)
+ * depth: 7 fg: 794.17 ( 4.98) FG: 2361.84 ( 9.31)
+ * depth: 6 fg: 400.67 ( 3.10) FG: 1188.68 ( 6.04)
+ * depth: 5 fg: 202.22 ( 1.87) FG: 599.81 ( 3.87)
+ * depth: 4 fg: 101.62 ( 1.02) FG: 303.49 ( 2.38)
+ * depth: 3 fg: 50.37 ( 0.53) FG: 153.65 ( 1.39)
+ * depth: 2 fg: 24.07 ( 0.25) FG: 78.20 ( 0.73)
+ * depth: 1 fg: 10.99 ( 0.08) FG: 39.82 ( 0.41)
+ * depth: 0 fg: 4.00 ( 0.00) FG: 19.61 ( 0.49)
+ *
+ * Integers are actually represented either in binary notation over
+ * 31-bit words (signed, using two's complement), or in RNS, modulo
+ * many small primes. These small primes are close to, but slightly
+ * lower than, 2^31. Use of RNS loses less than two bits, even for
+ * the largest values.
+ *
+ * IMPORTANT: if these values are modified, then the temporary buffer
+ * sizes (FALCON_KEYGEN_TEMP_*, in inner.h) must be recomputed
+ * accordingly.
+ */
+
+static const size_t MAX_BL_SMALL[] = {
+ 1, 1, 2, 2, 4, 7, 14, 27, 53, 106, 209
+};
+
+static const size_t MAX_BL_LARGE[] = {
+ 2, 2, 5, 7, 12, 21, 40, 78, 157, 308
+};
+
+/*
+ * Average and standard deviation for the maximum size (in bits) of
+ * coefficients of (f,g), depending on depth. These values are used
+ * to compute bounds for Babai's reduction.
+ */
+static const struct {
+ int avg;
+ int std;
+} BITLENGTH[] = {
+ { 4, 0 },
+ { 11, 1 },
+ { 24, 1 },
+ { 50, 1 },
+ { 102, 1 },
+ { 202, 2 },
+ { 401, 4 },
+ { 794, 5 },
+ { 1577, 8 },
+ { 3138, 13 },
+ { 6308, 25 }
+};
+
+/*
+ * Minimal recursion depth at which we rebuild intermediate values
+ * when reconstructing f and g.
+ */
+#define DEPTH_INT_FG 4
+
+/*
+ * Compute squared norm of a short vector. Returned value is saturated to
+ * 2^32-1 if it is not lower than 2^31.
+ */
+static uint32_t
+poly_small_sqnorm(const int8_t *f, unsigned logn) {
+ size_t n, u;
+ uint32_t s, ng;
+
+ n = MKN(logn);
+ s = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = f[u];
+ s += (uint32_t)(z * z);
+ ng |= s;
+ }
+ return s | -(ng >> 31);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'fpr'.
+ */
+static fpr *
+align_fpr(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(fpr);
+ if (km) {
+ k += (sizeof(fpr)) - km;
+ }
+ return (fpr *)(cb + k);
+}
+
+/*
+ * Align (upwards) the provided 'data' pointer with regard to 'base'
+ * so that the offset is a multiple of the size of 'uint32_t'.
+ */
+static uint32_t *
+align_u32(void *base, void *data) {
+ uint8_t *cb, *cd;
+ size_t k, km;
+
+ cb = base;
+ cd = data;
+ k = (size_t)(cd - cb);
+ km = k % sizeof(uint32_t);
+ if (km) {
+ k += (sizeof(uint32_t)) - km;
+ }
+ return (uint32_t *)(cb + k);
+}
+
+/*
+ * Convert a small vector to floating point.
+ */
+static void
+poly_small_to_fp(fpr *x, const int8_t *f, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ x[u] = fpr_of(f[u]);
+ }
+}
+
+/*
+ * Input: f,g of degree N = 2^logn; 'depth' is used only to get their
+ * individual length.
+ *
+ * Output: f',g' of degree N/2, with the length for 'depth+1'.
+ *
+ * Values are in RNS; input and/or output may also be in NTT.
+ */
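+/*
+ * (Illustrative note, added for clarity.) In the chosen NTT ordering,
+ * the two roots in each consecutive pair are x and -x, so the products
+ * t1[2v]*t1[2v+1] computed below evaluate N(f)(x^2) = f(x)*f(-x); this
+ * is the degree-halving step used by the recursive NTRU solver.
+ */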
+static void
+make_fg_step(uint32_t *data, unsigned logn, unsigned depth,
+ int in_ntt, int out_ntt) {
+ size_t n, hn, u;
+ size_t slen, tlen;
+ uint32_t *fd, *gd, *fs, *gs, *gm, *igm, *t1;
+ const small_prime *primes;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ slen = MAX_BL_SMALL[depth];
+ tlen = MAX_BL_SMALL[depth + 1];
+ primes = PRIMES;
+
+ /*
+ * Prepare room for the result.
+ */
+ fd = data;
+ gd = fd + hn * tlen;
+ fs = gd + hn * tlen;
+ gs = fs + n * slen;
+ gm = gs + n * slen;
+ igm = gm + n;
+ t1 = igm + n;
+ memmove(fs, data, 2 * n * slen * sizeof * data);
+
+ /*
+ * First slen words: we use the input values directly, and apply
+ * inverse NTT as we go.
+ */
+ for (u = 0; u < slen; u ++) {
+ uint32_t p, p0i, R2;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ for (v = 0, x = fs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(fs + u, slen, igm, logn, p, p0i);
+ }
+
+ for (v = 0, x = gs + u; v < n; v ++, x += slen) {
+ t1[v] = *x;
+ }
+ if (!in_ntt) {
+ modp_NTT2(t1, gm, logn, p, p0i);
+ }
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ if (in_ntt) {
+ modp_iNTT2_ext(gs + u, slen, igm, logn, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+
+ /*
+ * Since the fs and gs words have been de-NTTized, we can use the
+ * CRT to rebuild the values.
+ */
+ zint_rebuild_CRT(fs, slen, slen, n, primes, 1, gm);
+ zint_rebuild_CRT(gs, slen, slen, n, primes, 1, gm);
+
+ /*
+ * Remaining words: use modular reductions to extract the values.
+ */
+ for (u = slen; u < tlen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *x;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+ for (v = 0, x = fs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = fd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+ for (v = 0, x = gs; v < n; v ++, x += slen) {
+ t1[v] = zint_mod_small_signed(x, slen, p, p0i, R2, Rx);
+ }
+ modp_NTT2(t1, gm, logn, p, p0i);
+ for (v = 0, x = gd + u; v < hn; v ++, x += tlen) {
+ uint32_t w0, w1;
+
+ w0 = t1[(v << 1) + 0];
+ w1 = t1[(v << 1) + 1];
+ *x = modp_montymul(
+ modp_montymul(w0, w1, p, p0i), R2, p, p0i);
+ }
+
+ if (!out_ntt) {
+ modp_iNTT2_ext(fd + u, tlen, igm, logn - 1, p, p0i);
+ modp_iNTT2_ext(gd + u, tlen, igm, logn - 1, p, p0i);
+ }
+ }
+}
+
+/*
+ * Compute f and g at a specific depth, in RNS notation.
+ *
+ * Returned values are stored in the data[] array, at slen words per integer.
+ *
+ * Conditions:
+ * 0 <= depth <= logn
+ *
+ * Space use in data[]: enough room for any two successive values (f', g',
+ * f and g).
+ */
+static void
+make_fg(uint32_t *data, const int8_t *f, const int8_t *g,
+ unsigned logn, unsigned depth, int out_ntt) {
+ size_t n, u;
+ uint32_t *ft, *gt, p0;
+ unsigned d;
+ const small_prime *primes;
+
+ n = MKN(logn);
+ ft = data;
+ gt = ft + n;
+ primes = PRIMES;
+ p0 = primes[0].p;
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p0);
+ gt[u] = modp_set(g[u], p0);
+ }
+
+ if (depth == 0 && out_ntt) {
+ uint32_t *gm, *igm;
+ uint32_t p, p0i;
+
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ gm = gt + n;
+ igm = gm + MKN(logn);
+ modp_mkgm2(gm, igm, logn, primes[0].g, p, p0i);
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ return;
+ }
+
+ if (depth == 0) {
+ return;
+ }
+
+ if (depth == 1) {
+ make_fg_step(data, logn, 0, 0, out_ntt);
+ return;
+ }
+
+ make_fg_step(data, logn, 0, 0, 1);
+ for (d = 1; d + 1 < depth; d ++) {
+ make_fg_step(data, logn - d, d, 1, 1);
+ }
+ make_fg_step(data, logn - depth + 1, depth - 1, 1, out_ntt);
+
+}
+
+/*
+ * Solving the NTRU equation, deepest level: compute the resultants of
+ * f and g with X^N+1, and use binary GCD. The F and G values are
+ * returned in tmp[].
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_deepest(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t len;
+ uint32_t *Fp, *Gp, *fp, *gp, *t1, q;
+ const small_prime *primes;
+
+ len = MAX_BL_SMALL[logn_top];
+ primes = PRIMES;
+
+ Fp = tmp;
+ Gp = Fp + len;
+ fp = Gp + len;
+ gp = fp + len;
+ t1 = gp + len;
+
+ make_fg(fp, f, g, logn_top, logn_top, 0);
+
+ /*
+ * We use the CRT to rebuild the resultants as big integers.
+ * There are two such big integers. The resultants are always
+ * nonnegative.
+ */
+ zint_rebuild_CRT(fp, len, len, 2, primes, 0, t1);
+
+ /*
+ * Apply the binary GCD. The zint_bezout() function works only
+ * if both inputs are odd.
+ *
+ * We can test on the result and return 0 because a failure here
+ * means that the NTRU equation cannot be solved, and the (f,g)
+ * values will be abandoned in that case.
+ */
+ if (!zint_bezout(Gp, Fp, fp, gp, len, t1)) {
+ return 0;
+ }
+
+ /*
+ * Multiply the two values by the target value q. Values must
+ * fit in the destination arrays.
+ * We can again test on the returned words: a non-zero output
+ * of zint_mul_small() means that we exceeded our array
+ * capacity, and that implies failure and rejection of (f,g).
+ */
+ q = 12289;
+ if (zint_mul_small(Fp, len, q) != 0
+ || zint_mul_small(Gp, len, q) != 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, intermediate level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ * This function MAY be invoked for the top-level (in which case depth = 0).
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_intermediate(unsigned logn_top,
+ const int8_t *f, const int8_t *g, unsigned depth, uint32_t *tmp) {
+ /*
+ * In this function, 'logn' is the log2 of the degree for
+ * this step. If N = 2^logn, then:
+ * - the F and G values already in tmp[] (from the deeper
+ * levels) have degree N/2;
+ * - this function should return F and G of degree N.
+ */
+ unsigned logn;
+ size_t n, hn, slen, dlen, llen, rlen, FGlen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5;
+ int scale_fg, minbl_fg, maxbl_fg, maxbl_FG, scale_k;
+ uint32_t *x, *y;
+ int32_t *k;
+ const small_prime *primes;
+
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2 or N/3)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+ primes = PRIMES;
+
+ /*
+ * Fd and Gd are the F and G from the deeper level.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+
+ /*
+ * Compute the input f and g for this level. Note that we get f
+ * and g in RNS + NTT representation.
+ */
+ ft = Gd + dlen * hn;
+ make_fg(ft, f, g, logn_top, depth, 1);
+
+ /*
+ * Move the newly computed f and g to make room for our candidate
+ * F and G (unreduced).
+ */
+ Ft = tmp;
+ Gt = Ft + n * llen;
+ t1 = Gt + n * llen;
+ memmove(t1, ft, 2 * n * slen * sizeof * ft);
+ ft = t1;
+ gt = ft + slen * n;
+ t1 = gt + slen * n;
+
+ /*
+ * Move Fd and Gd _after_ f and g.
+ */
+ memmove(t1, Fd, 2 * hn * dlen * sizeof * Fd);
+ Fd = t1;
+ Gd = Fd + hn * dlen;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt (only n/2 values in each).
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * We do not need Fd and Gd after that point.
+ */
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = primes[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * If we processed slen words, then f and g have been
+ * de-NTTized, and are in RNS; we can rebuild them.
+ */
+ if (u == slen) {
+ zint_rebuild_CRT(ft, slen, slen, n, primes, 1, t1);
+ zint_rebuild_CRT(gt, slen, slen, n, primes, 1, t1);
+ }
+
+ gm = t1;
+ igm = gm + n;
+ fx = igm + n;
+ gx = fx + n;
+
+ modp_mkgm2(gm, igm, logn, primes[u].g, p, p0i);
+
+ if (u < slen) {
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = *x;
+ gx[v] = *y;
+ }
+ modp_iNTT2_ext(ft + u, slen, igm, logn, p, p0i);
+ modp_iNTT2_ext(gt + u, slen, igm, logn, p, p0i);
+ } else {
+ uint32_t Rx;
+
+ Rx = modp_Rx((unsigned)slen, p, p0i, R2);
+ for (v = 0, x = ft, y = gt;
+ v < n; v ++, x += slen, y += slen) {
+ fx[v] = zint_mod_small_signed(x, slen,
+ p, p0i, R2, Rx);
+ gx[v] = zint_mod_small_signed(y, slen,
+ p, p0i, R2, Rx);
+ }
+ modp_NTT2(fx, gm, logn, p, p0i);
+ modp_NTT2(gx, gm, logn, p, p0i);
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed in
+ * a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * General case:
+ *
+ * we divide degree by d = 2 or 3
+ * f'(x^d) = N(f)(x^d) = f * adj(f)
+ * g'(x^d) = N(g)(x^d) = g * adj(g)
+ * f'*G' - g'*F' = q
+ * F = F'(x^d) * adj(g)
+ * G = G'(x^d) * adj(f)
+ *
+ * We compute things in the NTT. We group roots of phi
+ * such that all roots x in a group share the same x^d.
+ * If the roots in a group are x_1, x_2... x_d, then:
+ *
+ * N(f)(x_1^d) = f(x_1)*f(x_2)*...*f(x_d)
+ *
+ * Thus, we have:
+ *
+ * G(x_1) = f(x_2)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * G(x_2) = f(x_1)*f(x_3)*...*f(x_d)*G'(x_1^d)
+ * ...
+ * G(x_d) = f(x_1)*f(x_2)*...*f(x_{d-1})*G'(x_1^d)
+ *
+ * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, in our chosen NTT representation, roots
+ * from the same group are consecutive in RAM.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u; v < hn;
+ v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+ }
+
+ /*
+ * Rebuild F and G with the CRT.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n, primes, 1, t1);
+ zint_rebuild_CRT(Gt, llen, llen, n, primes, 1, t1);
+
+ /*
+ * At that point, Ft, Gt, ft and gt are consecutive in RAM (in that
+ * order).
+ */
+
+ /*
+ * Apply Babai reduction to bring back F and G to size slen.
+ *
+ * We use the FFT to compute successive approximations of the
+ * reduction coefficient. We first isolate the top bits of
+ * the coefficients of f and g, and convert them to floating
+ * point; with the FFT, we compute adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)).
+ *
+ * Then, we repeatedly apply the following:
+ *
+ * - Get the top bits of the coefficients of F and G into
+ * floating point, and use the FFT to compute:
+ * (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g))
+ *
+ * - Convert back that value into normal representation, and
+ * round it to the nearest integers, yielding a polynomial k.
+ * Proper scaling is applied to f, g, F and G so that the
+ * coefficients fit on 32 bits (signed).
+ *
+ * - Subtract k*f from F and k*g from G.
+ *
+ * Under normal conditions, this process reduces the size of F
+ * and G by some bits at each iteration. For constant-time
+ * operation, we do not want to measure the actual length of
+ * F and G; instead, we do the following:
+ *
+ * - f and g are converted to floating-point, with some scaling
+ * if necessary to keep values in the representable range.
+ *
+ * - For each iteration, we _assume_ a maximum size for F and G,
+ * and use the values at that size. If we overreach, then
+ * we get zeros, which is harmless: the resulting coefficients
+ * of k will be 0 and the value won't be reduced.
+ *
+ * - We conservatively assume that F and G will be reduced by
+ * at least 25 bits at each iteration.
+ *
+ * Even when reaching the bottom of the reduction, the reduction
+ * coefficient will remain low. If it goes out-of-range, then
+ * something wrong occurred and the whole NTRU solving fails.
+ */
+
+ /*
+ * Memory layout:
+ * - We need to compute and keep adj(f), adj(g), and
+ * 1/(f*adj(f)+g*adj(g)) (sizes N, N and N/2 fp numbers,
+ * respectively).
+ * - At each iteration we need two extra fp buffers (N fp values each),
+ * and produce a k (N 32-bit words). k will be shared with one
+ * of the fp buffers.
+ * - To compute k*f and k*g efficiently (with the NTT), we need
+ * some extra room; we reuse the space of the temporary buffers.
+ *
+ * Arrays of 'fpr' are obtained from the temporary array itself.
+ * We ensure that the base is at a properly aligned offset (the
+ * source array tmp[] is supposed to be already aligned).
+ */
+
+ rt3 = align_fpr(tmp, t1);
+ rt4 = rt3 + n;
+ rt5 = rt4 + n;
+ rt1 = rt5 + (n >> 1);
+ k = (int32_t *)align_u32(tmp, rt1);
+ rt2 = align_fpr(tmp, k + n);
+ if (rt2 < (rt1 + n)) {
+ rt2 = rt1 + n;
+ }
+ t1 = (uint32_t *)k + n;
+
+ /*
+ * Get f and g into rt3 and rt4 as floating-point approximations.
+ *
+ * We need to "scale down" the floating-point representation of
+ * coefficients when they are too big. We want to keep the value
+ * below 2^310 or so. Thus, when values are larger than 10 words,
+ * we consider only the top 10 words. Array lengths have been
+ * computed so that average maximum length will fall in the
+ * middle or the upper half of these top 10 words.
+ */
+ if (slen > 10) {
+ rlen = 10;
+ } else {
+ rlen = slen;
+ }
+ poly_big_to_fp(rt3, ft + slen - rlen, rlen, slen, logn);
+ poly_big_to_fp(rt4, gt + slen - rlen, rlen, slen, logn);
+
+ /*
+ * Values in rt3 and rt4 are downscaled by 2^(scale_fg).
+ */
+ scale_fg = 31 * (int)(slen - rlen);
+
+ /*
+ * Estimated boundaries for the maximum size (in bits) of the
+ * coefficients of (f,g). We use the measured average, and
+ * allow for a deviation of at most six times the standard
+ * deviation.
+ */
+ minbl_fg = BITLENGTH[depth].avg - 6 * BITLENGTH[depth].std;
+ maxbl_fg = BITLENGTH[depth].avg + 6 * BITLENGTH[depth].std;
+
+ /*
+ * Compute 1/(f*adj(f)+g*adj(g)) in rt5. We also keep adj(f)
+ * and adj(g) in rt3 and rt4, respectively.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(rt5, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt4, logn);
+
+ /*
+ * Reduce F and G repeatedly.
+ *
+ * The expected maximum bit length of coefficients of F and G
+ * is kept in maxbl_FG, with the corresponding word length in
+ * FGlen.
+ */
+ FGlen = llen;
+ maxbl_FG = 31 * (int)llen;
+
+ /*
+ * Each reduction operation computes the reduction polynomial
+ * "k". We need that polynomial to have coefficients that fit
+ * on 32-bit signed integers, with some scaling; thus, we use
+ * a descending sequence of scaling values, down to zero.
+ *
+ * The size of the coefficients of k is (roughly) the difference
+ * between the size of the coefficients of (F,G) and the size
+ * of the coefficients of (f,g). Thus, the maximum size of the
+ * coefficients of k is, at the start, maxbl_FG - minbl_fg;
+ * this is our starting scale value for k.
+ *
+ * We need to estimate the size of (F,G) during the execution of
+ * the algorithm; we are allowed some overestimation but not too
+ * much (poly_big_to_fp() uses a 310-bit window). Generally
+ * speaking, after applying a reduction with k scaled to
+ * scale_k, the size of (F,G) will be size(f,g) + scale_k + dd,
+ * where 'dd' is a few bits to account for the fact that the
+ * reduction is never perfect (intuitively, dd is on the order
+ * of sqrt(N), so at most 5 bits; we here allow for 10 extra
+ * bits).
+ *
+ * The size of (f,g) is not known exactly, but maxbl_fg is an
+ * upper bound.
+ */
+ scale_k = maxbl_FG - minbl_fg;
+
+ for (;;) {
+ int scale_FG, dc, new_maxbl_FG;
+ uint32_t scl, sch;
+ fpr pdc, pt;
+
+ /*
+ * Convert current F and G into floating-point. We apply
+ * scaling if the current length is more than 10 words.
+ */
+ if (FGlen > 10) {
+ rlen = 10;
+ } else {
+ rlen = FGlen;
+ }
+ scale_FG = 31 * (int)(FGlen - rlen);
+ poly_big_to_fp(rt1, Ft + FGlen - rlen, rlen, llen, logn);
+ poly_big_to_fp(rt2, Gt + FGlen - rlen, rlen, llen, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) in rt2.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(rt2, rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt2, rt5, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * (f,g) are scaled by 'scale_fg', meaning that the
+ * numbers in rt3/rt4 should be multiplied by 2^(scale_fg)
+ * to have their true mathematical value.
+ *
+ * (F,G) are similarly scaled by 'scale_FG'. Therefore,
+ * the value we computed in rt2 is scaled by
+ * 'scale_FG-scale_fg'.
+ *
+ * We want that value to be scaled by 'scale_k', hence we
+ * apply a corrective scaling. After scaling, the values
+		 * should fit in the -(2^31-1)..+(2^31-1) range.
+ */
+ dc = scale_k - scale_FG + scale_fg;
+
+ /*
+ * We will need to multiply values by 2^(-dc). The value
+ * 'dc' is not secret, so we can compute 2^(-dc) with a
+ * non-constant-time process.
+ * (We could use ldexp(), but we prefer to avoid any
+ * dependency on libm. When using FP emulation, we could
+ * use our fpr_ldexp(), which is constant-time.)
+ */
+ if (dc < 0) {
+ dc = -dc;
+ pt = fpr_two;
+ } else {
+ pt = fpr_onehalf;
+ }
+ pdc = fpr_one;
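+		/*
+		 * Square-and-multiply: at the end of the loop below, pdc
+		 * equals pt^dc, i.e. 2^(-dc) for the original signed dc.
+		 */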
+ while (dc != 0) {
+ if ((dc & 1) != 0) {
+ pdc = fpr_mul(pdc, pt);
+ }
+ dc >>= 1;
+ pt = fpr_sqr(pt);
+ }
+
+ for (u = 0; u < n; u ++) {
+ fpr xv;
+
+ xv = fpr_mul(rt2[u], pdc);
+
+ /*
+ * Sometimes the values can be out-of-bounds if
+ * the algorithm fails; we must not call
+ * fpr_rint() (and cast to int32_t) if the value
+ * is not in-bounds. Note that the test does not
+ * break constant-time discipline, since any
+ * failure here implies that we discard the current
+ * secret key (f,g).
+ */
+ if (!fpr_lt(fpr_mtwo31m1, xv)
+ || !fpr_lt(xv, fpr_ptwo31m1)) {
+ return 0;
+ }
+ k[u] = (int32_t)fpr_rint(xv);
+ }
+
+ /*
+ * Values in k[] are integers. They really are scaled
+ * down by maxbl_FG - minbl_fg bits.
+ *
+ * If we are at low depth, then we use the NTT to
+ * compute k*f and k*g.
+ */
+ sch = (uint32_t)(scale_k / 31);
+ scl = (uint32_t)(scale_k % 31);
+ if (depth <= DEPTH_INT_FG) {
+ poly_sub_scaled_ntt(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn, t1);
+ poly_sub_scaled_ntt(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn, t1);
+ } else {
+ poly_sub_scaled(Ft, FGlen, llen, ft, slen, slen,
+ k, sch, scl, logn);
+ poly_sub_scaled(Gt, FGlen, llen, gt, slen, slen,
+ k, sch, scl, logn);
+ }
+
+ /*
+ * We compute the new maximum size of (F,G), assuming that
+ * (f,g) has _maximal_ length (i.e. that reduction is
+		 * "late" instead of "early"). We also adjust FGlen
+ * accordingly.
+ */
+ new_maxbl_FG = scale_k + maxbl_fg + 10;
+ if (new_maxbl_FG < maxbl_FG) {
+ maxbl_FG = new_maxbl_FG;
+ if ((int)FGlen * 31 >= maxbl_FG + 31) {
+ FGlen --;
+ }
+ }
+
+ /*
+ * We suppose that scaling down achieves a reduction by
+ * at least 25 bits per iteration. We stop when we have
+ * done the loop with an unscaled k.
+ */
+ if (scale_k <= 0) {
+ break;
+ }
+ scale_k -= 25;
+ if (scale_k < 0) {
+ scale_k = 0;
+ }
+ }
+
+ /*
+ * If (F,G) length was lowered below 'slen', then we must take
+ * care to re-extend the sign.
+ */
+ if (FGlen < slen) {
+ for (u = 0; u < n; u ++, Ft += llen, Gt += llen) {
+ size_t v;
+ uint32_t sw;
+
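+			/*
+			 * Limbs hold 31 bits each; bit 30 of the top limb is
+			 * the sign bit, so sw is 0 for a nonnegative value and
+			 * 0x7FFFFFFF for a negative one.
+			 */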
+ sw = -(Ft[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Ft[v] = sw;
+ }
+ sw = -(Gt[FGlen - 1] >> 30) >> 1;
+ for (v = FGlen; v < slen; v ++) {
+ Gt[v] = sw;
+ }
+ }
+ }
+
+ /*
+ * Compress encoding of all values to 'slen' words (this is the
+ * expected output format).
+ */
+ for (u = 0, x = tmp, y = tmp;
+ u < (n << 1); u ++, x += slen, y += llen) {
+ memmove(x, y, slen * sizeof * y);
+ }
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, binary case, depth = 1. Upon entry, the
+ * F and G from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth1(unsigned logn_top,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ /*
+ * The first half of this function is a copy of the corresponding
+ * part in solve_NTRU_intermediate(), for the reconstruction of
+ * the unreduced F and G. The second half (Babai reduction) is
+ * done differently, because the unreduced F and G fit in 53 bits
+ * of precision, allowing a much simpler process with lower RAM
+ * usage.
+ */
+ unsigned depth, logn;
+ size_t n_top, n, hn, slen, dlen, llen, u;
+ uint32_t *Fd, *Gd, *Ft, *Gt, *ft, *gt, *t1;
+ fpr *rt1, *rt2, *rt3, *rt4, *rt5, *rt6;
+ uint32_t *x, *y;
+
+ depth = 1;
+ n_top = (size_t)1 << logn_top;
+ logn = logn_top - depth;
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ */
+
+ /*
+ * slen = size for our input f and g; also size of the reduced
+ * F and G we return (degree N)
+ *
+ * dlen = size of the F and G obtained from the deeper level
+ * (degree N/2)
+ *
+ * llen = size for intermediary F and G before reduction (degree N)
+ *
+ * We build our non-reduced F and G as two independent halves each,
+ * of degree N/2 (F = F0 + X*F1, G = G0 + X*G1).
+ */
+ slen = MAX_BL_SMALL[depth];
+ dlen = MAX_BL_SMALL[depth + 1];
+ llen = MAX_BL_LARGE[depth];
+
+ /*
+ * Fd and Gd are the F and G from the deeper level. Ft and Gt
+ * are the destination arrays for the unreduced F and G.
+ */
+ Fd = tmp;
+ Gd = Fd + dlen * hn;
+ Ft = Gd + dlen * hn;
+ Gt = Ft + llen * n;
+
+ /*
+ * We reduce Fd and Gd modulo all the small primes we will need,
+ * and store the values in Ft and Gt.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2, Rx;
+ size_t v;
+ uint32_t *xs, *ys, *xd, *yd;
+
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+ Rx = modp_Rx((unsigned)dlen, p, p0i, R2);
+ for (v = 0, xs = Fd, ys = Gd, xd = Ft + u, yd = Gt + u;
+ v < hn;
+ v ++, xs += dlen, ys += dlen, xd += llen, yd += llen) {
+ *xd = zint_mod_small_signed(xs, dlen, p, p0i, R2, Rx);
+ *yd = zint_mod_small_signed(ys, dlen, p, p0i, R2, Rx);
+ }
+ }
+
+ /*
+ * Now Fd and Gd are not needed anymore; we can squeeze them out.
+ */
+ memmove(tmp, Ft, llen * n * sizeof(uint32_t));
+ Ft = tmp;
+ memmove(Ft + llen * n, Gt, llen * n * sizeof(uint32_t));
+ Gt = Ft + llen * n;
+ ft = Gt + llen * n;
+ gt = ft + slen * n;
+
+ t1 = gt + slen * n;
+
+ /*
+ * Compute our F and G modulo sufficiently many small primes.
+ */
+ for (u = 0; u < llen; u ++) {
+ uint32_t p, p0i, R2;
+ uint32_t *gm, *igm, *fx, *gx, *Fp, *Gp;
+ unsigned e;
+ size_t v;
+
+ /*
+ * All computations are done modulo p.
+ */
+ p = PRIMES[u].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ /*
+ * We recompute things from the source f and g, of full
+ * degree. However, we will need only the n first elements
+		 * of the inverse NTT table (igm); the call to modp_mkgm2()
+ * below will fill n_top elements in igm[] (thus overflowing
+ * into fx[]) but later code will overwrite these extra
+ * elements.
+ */
+ gm = t1;
+ igm = gm + n_top;
+ fx = igm + n;
+ gx = fx + n_top;
+ modp_mkgm2(gm, igm, logn_top, PRIMES[u].g, p, p0i);
+
+ /*
+		 * Set fx and gx to f and g modulo p, respectively.
+ */
+ for (v = 0; v < n_top; v ++) {
+ fx[v] = modp_set(f[v], p);
+ gx[v] = modp_set(g[v], p);
+ }
+
+ /*
+ * Convert to NTT and compute our f and g.
+ */
+ modp_NTT2(fx, gm, logn_top, p, p0i);
+ modp_NTT2(gx, gm, logn_top, p, p0i);
+ for (e = logn_top; e > logn; e --) {
+ modp_poly_rec_res(fx, e, p, p0i, R2);
+ modp_poly_rec_res(gx, e, p, p0i, R2);
+ }
+
+ /*
+ * From that point onward, we only need tables for
+ * degree n, so we can save some space.
+ */
+ if (depth > 0) { /* always true */
+ memmove(gm + n, igm, n * sizeof * igm);
+ igm = gm + n;
+ memmove(igm + n, fx, n * sizeof * ft);
+ fx = igm + n;
+ memmove(fx + n, gx, n * sizeof * gt);
+ gx = fx + n;
+ }
+
+ /*
+ * Get F' and G' modulo p and in NTT representation
+ * (they have degree n/2). These values were computed
+ * in a previous step, and stored in Ft and Gt.
+ */
+ Fp = gx + n;
+ Gp = Fp + hn;
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += llen, y += llen) {
+ Fp[v] = *x;
+ Gp[v] = *y;
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Compute our F and G modulo p.
+ *
+ * Equations are:
+ *
+ * f'(x^2) = N(f)(x^2) = f * adj(f)
+ * g'(x^2) = N(g)(x^2) = g * adj(g)
+ *
+ * f'*G' - g'*F' = q
+ *
+ * F = F'(x^2) * adj(g)
+ * G = G'(x^2) * adj(f)
+ *
+ * The NTT representation of f is f(w) for all w which
+ * are roots of phi. In the binary case, as well as in
+ * the ternary case for all depth except the deepest,
+ * these roots can be grouped in pairs (w,-w), and we
+ * then have:
+ *
+ * f(w) = adj(f)(-w)
+ * f(-w) = adj(f)(w)
+ *
+ * and w^2 is then a root for phi at the half-degree.
+ *
+ * At the deepest level in the ternary case, this still
+ * holds, in the following sense: the roots of x^2-x+1
+ * are (w,-w^2) (for w^3 = -1, and w != -1), and we
+ * have:
+ *
+ * f(w) = adj(f)(-w^2)
+ * f(-w^2) = adj(f)(w)
+ *
+		 * In all cases, we can thus compute F and G in NTT
+ * representation by a few simple multiplications.
+ * Moreover, the two roots for each pair are consecutive
+ * in our bit-reversal encoding.
+ */
+ for (v = 0, x = Ft + u, y = Gt + u;
+ v < hn; v ++, x += (llen << 1), y += (llen << 1)) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = fx[(v << 1) + 0];
+ ftB = fx[(v << 1) + 1];
+ gtA = gx[(v << 1) + 0];
+ gtB = gx[(v << 1) + 1];
+ mFp = modp_montymul(Fp[v], R2, p, p0i);
+ mGp = modp_montymul(Gp[v], R2, p, p0i);
+ x[0] = modp_montymul(gtB, mFp, p, p0i);
+ x[llen] = modp_montymul(gtA, mFp, p, p0i);
+ y[0] = modp_montymul(ftB, mGp, p, p0i);
+ y[llen] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2_ext(Ft + u, llen, igm, logn, p, p0i);
+ modp_iNTT2_ext(Gt + u, llen, igm, logn, p, p0i);
+
+ /*
+ * Also save ft and gt (only up to size slen).
+ */
+ if (u < slen) {
+ modp_iNTT2(fx, igm, logn, p, p0i);
+ modp_iNTT2(gx, igm, logn, p, p0i);
+ for (v = 0, x = ft + u, y = gt + u;
+ v < n; v ++, x += slen, y += slen) {
+ *x = fx[v];
+ *y = gx[v];
+ }
+ }
+ }
+
+ /*
+ * Rebuild f, g, F and G with the CRT. Note that the elements of F
+ * and G are consecutive, and thus can be rebuilt in a single
+ * loop; similarly, the elements of f and g are consecutive.
+ */
+ zint_rebuild_CRT(Ft, llen, llen, n << 1, PRIMES, 1, t1);
+ zint_rebuild_CRT(ft, slen, slen, n << 1, PRIMES, 1, t1);
+
+ /*
+ * Here starts the Babai reduction, specialized for depth = 1.
+ *
+ * Candidates F and G (from Ft and Gt), and base f and g (ft and gt),
+ * are converted to floating point. There is no scaling, and a
+ * single pass is sufficient.
+ */
+
+ /*
+ * Convert F and G into floating point (rt1 and rt2).
+ */
+ rt1 = align_fpr(tmp, gt + slen * n);
+ rt2 = rt1 + n;
+ poly_big_to_fp(rt1, Ft, llen, llen, logn);
+ poly_big_to_fp(rt2, Gt, llen, llen, logn);
+
+ /*
+ * Integer representation of F and G is no longer needed, we
+ * can remove it.
+ */
+ memmove(tmp, ft, 2 * slen * n * sizeof * ft);
+ ft = tmp;
+ gt = ft + slen * n;
+ rt3 = align_fpr(tmp, gt + slen * n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * Convert f and g into floating point (rt3 and rt4).
+ */
+ poly_big_to_fp(rt3, ft, slen, slen, logn);
+ poly_big_to_fp(rt4, gt, slen, slen, logn);
+
+ /*
+ * Remove unneeded ft and gt.
+ */
+ memmove(tmp, rt1, 4 * n * sizeof * rt1);
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ rt4 = rt3 + n;
+
+ /*
+ * We now have:
+ * rt1 = F
+ * rt2 = G
+ * rt3 = f
+ * rt4 = g
+ * in that order in RAM. We convert all of them to FFT.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = F*adj(f) + G*adj(g)
+ * rt6 = 1 / (f*adj(f) + g*adj(g))
+ * (Note that rt6 is half-length.)
+ */
+ rt5 = rt4 + n;
+ rt6 = rt5 + n;
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add_muladj_fft(rt5, rt1, rt2, rt3, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(rt6, rt3, rt4, logn);
+
+ /*
+ * Compute:
+ * rt5 = (F*adj(f)+G*adj(g)) / (f*adj(f)+g*adj(g))
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt5, rt6, logn);
+
+ /*
+ * Compute k as the rounded version of rt5. Check that none of
+ * the values is larger than 2^63-1 (in absolute value)
+ * because that would make the fpr_rint() do something undefined;
+ * note that any out-of-bounds value here implies a failure and
+ * (f,g) will be discarded, so we can make a simple test.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt5, logn);
+ for (u = 0; u < n; u ++) {
+ fpr z;
+
+ z = rt5[u];
+ if (!fpr_lt(z, fpr_ptwo63m1) || !fpr_lt(fpr_mtwo63m1, z)) {
+ return 0;
+ }
+ rt5[u] = fpr_of(fpr_rint(z));
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt5, logn);
+
+ /*
+ * Subtract k*f from F, and k*g from G.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt3, rt5, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(rt4, rt5, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(rt2, rt4, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt2, logn);
+
+ /*
+ * Convert back F and G to integers, and return.
+ */
+ Ft = tmp;
+ Gt = Ft + n;
+ rt3 = align_fpr(tmp, Gt + n);
+ memmove(rt3, rt1, 2 * n * sizeof * rt1);
+ rt1 = rt3;
+ rt2 = rt1 + n;
+ for (u = 0; u < n; u ++) {
+ Ft[u] = (uint32_t)fpr_rint(rt1[u]);
+ Gt[u] = (uint32_t)fpr_rint(rt2[u]);
+ }
+
+ return 1;
+}
+
+/*
+ * Solving the NTRU equation, top level. Upon entry, the F and G
+ * from the previous level should be in the tmp[] array.
+ *
+ * Returned value: 1 on success, 0 on error.
+ */
+static int
+solve_NTRU_binary_depth0(unsigned logn,
+ const int8_t *f, const int8_t *g, uint32_t *tmp) {
+ size_t n, hn, u;
+ uint32_t p, p0i, R2;
+ uint32_t *Fp, *Gp, *t1, *t2, *t3, *t4, *t5;
+ uint32_t *gm, *igm, *ft, *gt;
+ fpr *rt2, *rt3;
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Equations are:
+ *
+ * f' = f0^2 - X^2*f1^2
+ * g' = g0^2 - X^2*g1^2
+ * F' and G' are a solution to f'G' - g'F' = q (from deeper levels)
+ * F = F'*(g0 - X*g1)
+ * G = G'*(f0 - X*f1)
+ *
+ * f0, f1, g0, g1, f', g', F' and G' are all "compressed" to
+ * degree N/2 (their odd-indexed coefficients are all zero).
+ *
+ * Everything should fit in 31-bit integers, hence we can just use
+ * the first small prime p = 2147473409.
+ */
+ p = PRIMES[0].p;
+ p0i = modp_ninv31(p);
+ R2 = modp_R2(p, p0i);
+
+ Fp = tmp;
+ Gp = Fp + hn;
+ ft = Gp + hn;
+ gt = ft + n;
+ gm = gt + n;
+ igm = gm + n;
+
+ modp_mkgm2(gm, igm, logn, PRIMES[0].g, p, p0i);
+
+ /*
+	 * Convert F' and G' to NTT representation.
+ */
+ for (u = 0; u < hn; u ++) {
+ Fp[u] = modp_set(zint_one_to_plain(Fp + u), p);
+ Gp[u] = modp_set(zint_one_to_plain(Gp + u), p);
+ }
+ modp_NTT2(Fp, gm, logn - 1, p, p0i);
+ modp_NTT2(Gp, gm, logn - 1, p, p0i);
+
+ /*
+ * Load f and g and convert them to NTT representation.
+ */
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+
+ /*
+ * Build the unreduced F,G in ft and gt.
+ */
+ for (u = 0; u < n; u += 2) {
+ uint32_t ftA, ftB, gtA, gtB;
+ uint32_t mFp, mGp;
+
+ ftA = ft[u + 0];
+ ftB = ft[u + 1];
+ gtA = gt[u + 0];
+ gtB = gt[u + 1];
+ mFp = modp_montymul(Fp[u >> 1], R2, p, p0i);
+ mGp = modp_montymul(Gp[u >> 1], R2, p, p0i);
+ ft[u + 0] = modp_montymul(gtB, mFp, p, p0i);
+ ft[u + 1] = modp_montymul(gtA, mFp, p, p0i);
+ gt[u + 0] = modp_montymul(ftB, mGp, p, p0i);
+ gt[u + 1] = modp_montymul(ftA, mGp, p, p0i);
+ }
+ modp_iNTT2(ft, igm, logn, p, p0i);
+ modp_iNTT2(gt, igm, logn, p, p0i);
+
+ Gp = Fp + n;
+ t1 = Gp + n;
+ memmove(Fp, ft, 2 * n * sizeof * ft);
+
+ /*
+ * We now need to apply the Babai reduction. At that point,
+ * we have F and G in two n-word arrays.
+ *
+ * We can compute F*adj(f)+G*adj(g) and f*adj(f)+g*adj(g)
+ * modulo p, using the NTT. We still move memory around in
+ * order to save RAM.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+
+ /*
+ * Compute the NTT tables in t1 and t2. We do not keep t2
+ * (we'll recompute it later on).
+ */
+ modp_mkgm2(t1, t2, logn, PRIMES[0].g, p, p0i);
+
+ /*
+ * Convert F and G to NTT.
+ */
+ modp_NTT2(Fp, t1, logn, p, p0i);
+ modp_NTT2(Gp, t1, logn, p, p0i);
+
+ /*
+ * Load f and adj(f) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(f[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[n - u] = modp_set(-f[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Compute F*adj(f) in t2, and f*adj(f) in t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_montymul(w, Fp[u], p, p0i);
+ t3[u] = modp_montymul(w, t4[u], p, p0i);
+ }
+
+ /*
+ * Load g and adj(g) in t4 and t5, and convert them to NTT
+ * representation.
+ */
+ t4[0] = t5[0] = modp_set(g[0], p);
+ for (u = 1; u < n; u ++) {
+ t4[u] = modp_set(g[u], p);
+ t5[n - u] = modp_set(-g[u], p);
+ }
+ modp_NTT2(t4, t1, logn, p, p0i);
+ modp_NTT2(t5, t1, logn, p, p0i);
+
+ /*
+ * Add G*adj(g) to t2, and g*adj(g) to t3.
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = modp_montymul(t5[u], R2, p, p0i);
+ t2[u] = modp_add(t2[u],
+ modp_montymul(w, Gp[u], p, p0i), p);
+ t3[u] = modp_add(t3[u],
+ modp_montymul(w, t4[u], p, p0i), p);
+ }
+
+ /*
+ * Convert back t2 and t3 to normal representation (normalized
+ * around 0), and then
+ * move them to t1 and t2. We first need to recompute the
+ * inverse table for NTT.
+ */
+ modp_mkgm2(t1, t4, logn, PRIMES[0].g, p, p0i);
+ modp_iNTT2(t2, t4, logn, p, p0i);
+ modp_iNTT2(t3, t4, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint32_t)modp_norm(t2[u], p);
+ t2[u] = (uint32_t)modp_norm(t3[u], p);
+ }
+
+ /*
+ * At that point, array contents are:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * F*adj(f)+G*adj(g) (t1)
+ * f*adj(f)+g*adj(g) (t2)
+ *
+ * We want to divide t1 by t2. The result is not integral; it
+ * must be rounded. We thus need to use the FFT.
+ */
+
+ /*
+ * Get f*adj(f)+g*adj(g) in FFT representation. Since this
+ * polynomial is auto-adjoint, all its coordinates in FFT
+ * representation are actually real, so we can truncate off
+ * the imaginary parts.
+ */
+ rt3 = align_fpr(tmp, t3);
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t2)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+ rt2 = align_fpr(tmp, t2);
+ memmove(rt2, rt3, hn * sizeof * rt3);
+
+ /*
+ * Convert F*adj(f)+G*adj(g) in FFT representation.
+ */
+ rt3 = rt2 + hn;
+ for (u = 0; u < n; u ++) {
+ rt3[u] = fpr_of(((int32_t *)t1)[u]);
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt3, logn);
+
+ /*
+ * Compute (F*adj(f)+G*adj(g))/(f*adj(f)+g*adj(g)) and get
+ * its rounded normal representation in t1.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_div_autoadj_fft(rt3, rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt3, logn);
+ for (u = 0; u < n; u ++) {
+ t1[u] = modp_set((int32_t)fpr_rint(rt3[u]), p);
+ }
+
+ /*
+ * RAM contents are now:
+ *
+ * F (NTT representation) (Fp)
+ * G (NTT representation) (Gp)
+ * k (t1)
+ *
+ * We want to compute F-k*f, and G-k*g.
+ */
+ t2 = t1 + n;
+ t3 = t2 + n;
+ t4 = t3 + n;
+ t5 = t4 + n;
+ modp_mkgm2(t2, t3, logn, PRIMES[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ t4[u] = modp_set(f[u], p);
+ t5[u] = modp_set(g[u], p);
+ }
+ modp_NTT2(t1, t2, logn, p, p0i);
+ modp_NTT2(t4, t2, logn, p, p0i);
+ modp_NTT2(t5, t2, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t kw;
+
+ kw = modp_montymul(t1[u], R2, p, p0i);
+ Fp[u] = modp_sub(Fp[u],
+ modp_montymul(kw, t4[u], p, p0i), p);
+ Gp[u] = modp_sub(Gp[u],
+ modp_montymul(kw, t5[u], p, p0i), p);
+ }
+ modp_iNTT2(Fp, t3, logn, p, p0i);
+ modp_iNTT2(Gp, t3, logn, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Fp[u] = (uint32_t)modp_norm(Fp[u], p);
+ Gp[u] = (uint32_t)modp_norm(Gp[u], p);
+ }
+
+ return 1;
+}
+
+/*
+ * Solve the NTRU equation. Returned value is 1 on success, 0 on error.
+ * G can be NULL, in which case that value is computed but not returned.
+ * If any of the coefficients of F and G exceeds lim (in absolute value),
+ * then 0 is returned.
+ */
+static int
+solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
+ const int8_t *f, const int8_t *g, int lim, uint32_t *tmp) {
+ size_t n, u;
+ uint32_t *ft, *gt, *Ft, *Gt, *gm;
+ uint32_t p, p0i, r;
+ const small_prime *primes;
+
+ n = MKN(logn);
+
+ if (!solve_NTRU_deepest(logn, f, g, tmp)) {
+ return 0;
+ }
+
+ /*
+ * For logn <= 2, we need to use solve_NTRU_intermediate()
+ * directly, because coefficients are a bit too large and
+ * do not fit the hypotheses in solve_NTRU_binary_depth0().
+ */
+ if (logn <= 2) {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 0) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ } else {
+ unsigned depth;
+
+ depth = logn;
+ while (depth -- > 2) {
+ if (!solve_NTRU_intermediate(logn, f, g, depth, tmp)) {
+ return 0;
+ }
+ }
+ if (!solve_NTRU_binary_depth1(logn, f, g, tmp)) {
+ return 0;
+ }
+ if (!solve_NTRU_binary_depth0(logn, f, g, tmp)) {
+ return 0;
+ }
+ }
+
+ /*
+ * If no buffer has been provided for G, use a temporary one.
+ */
+ if (G == NULL) {
+ G = (int8_t *)(tmp + 2 * n);
+ }
+
+ /*
+	 * Final F and G are in tmp[], one word per coefficient
+ * (signed value over 31 bits).
+ */
+ if (!poly_big_to_small(F, tmp, lim, logn)
+ || !poly_big_to_small(G, tmp + n, lim, logn)) {
+ return 0;
+ }
+
+ /*
+ * Verify that the NTRU equation is fulfilled. Since all elements
+ * have short lengths, verifying modulo a small prime p works, and
+ * allows using the NTT.
+ *
+ * We put Gt[] first in tmp[], and process it first, so that it does
+ * not overlap with G[] in case we allocated it ourselves.
+ */
+ Gt = tmp;
+ ft = Gt + n;
+ gt = ft + n;
+ Ft = gt + n;
+ gm = Ft + n;
+
+ primes = PRIMES;
+ p = primes[0].p;
+ p0i = modp_ninv31(p);
+ modp_mkgm2(gm, tmp, logn, primes[0].g, p, p0i);
+ for (u = 0; u < n; u ++) {
+ Gt[u] = modp_set(G[u], p);
+ }
+ for (u = 0; u < n; u ++) {
+ ft[u] = modp_set(f[u], p);
+ gt[u] = modp_set(g[u], p);
+ Ft[u] = modp_set(F[u], p);
+ }
+ modp_NTT2(ft, gm, logn, p, p0i);
+ modp_NTT2(gt, gm, logn, p, p0i);
+ modp_NTT2(Ft, gm, logn, p, p0i);
+ modp_NTT2(Gt, gm, logn, p, p0i);
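+	/*
+	 * modp_montymul(a, b, p, p0i) returns a*b/R mod p (Montgomery
+	 * multiplication), so r below is q/R mod p and each z equals
+	 * (f*G - g*F)/R mod p at one NTT point; comparing them checks
+	 * that fG - gF = q modulo p.
+	 */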
+ r = modp_montymul(12289, 1, p, p0i);
+ for (u = 0; u < n; u ++) {
+ uint32_t z;
+
+ z = modp_sub(modp_montymul(ft[u], Gt[u], p, p0i),
+ modp_montymul(gt[u], Ft[u], p, p0i), p);
+ if (z != r) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Generate a random polynomial with a Gaussian distribution. This function
+ * also makes sure that the resultant of the polynomial with phi is odd.
+ */
+static void
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
+ size_t n, u;
+ unsigned mod2;
+
+ n = MKN(logn);
+ mod2 = 0;
+ for (u = 0; u < n; u ++) {
+ int s;
+
+restart:
+ s = mkgauss(rng, logn);
+
+ /*
+ * We need the coefficient to fit within -127..+127;
+ * realistically, this is always the case except for
+ * the very low degrees (N = 2 or 4), for which there
+ * is no real security anyway.
+ */
+ if (s < -127 || s > 127) {
+ goto restart;
+ }
+
+ /*
+		 * We need the sum of all coefficients to be odd; otherwise,
+ * the resultant of the polynomial with X^N+1 will be even,
+ * and the binary GCD will fail.
+ */
+ if (u == n - 1) {
+ if ((mod2 ^ (unsigned)(s & 1)) == 0) {
+ goto restart;
+ }
+ } else {
+ mod2 ^= (unsigned)(s & 1);
+ }
+ f[u] = (int8_t)s;
+ }
+}
+
+/* see falcon.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_keygen(inner_shake256_context *rng,
+ int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
+ unsigned logn, uint8_t *tmp) {
+ /*
+ * Algorithm is the following:
+ *
+ * - Generate f and g with the Gaussian distribution.
+ *
+ * - If either Res(f,phi) or Res(g,phi) is even, try again.
+ *
+ * - If ||(f,g)|| is too large, try again.
+ *
+ * - If ||B~_{f,g}|| is too large, try again.
+ *
+ * - If f is not invertible mod phi mod q, try again.
+ *
+ * - Compute h = g/f mod phi mod q.
+ *
+ * - Solve the NTRU equation fG - gF = q; if the solving fails,
+ * try again. Usual failure condition is when Res(f,phi)
+ * and Res(g,phi) are not prime to each other.
+ */
+ size_t n, u;
+ uint16_t *h2, *tmp2;
+ RNG_CONTEXT *rc;
+
+ n = MKN(logn);
+ rc = rng;
+
+ /*
+ * We need to generate f and g randomly, until we find values
+ * such that the norm of (g,-f), and of the orthogonalized
+ * vector, are satisfying. The orthogonalized vector is:
+ * (q*adj(f)/(f*adj(f)+g*adj(g)), q*adj(g)/(f*adj(f)+g*adj(g)))
+ * (it is actually the (N+1)-th row of the Gram-Schmidt basis).
+ *
+ * In the binary case, coefficients of f and g are generated
+ * independently of each other, with a discrete Gaussian
+ * distribution of standard deviation 1.17*sqrt(q/(2*N)). Then,
+ * the two vectors have expected norm 1.17*sqrt(q), which is
+ * also our acceptance bound: we require both vectors to be no
+ * larger than that (this will be satisfied about 1/4th of the
+ * time, thus we expect sampling new (f,g) about 4 times for that
+ * step).
+ *
+ * We require that Res(f,phi) and Res(g,phi) are both odd (the
+ * NTRU equation solver requires it).
+ */
+ for (;;) {
+ fpr *rt1, *rt2, *rt3;
+ fpr bnorm;
+ uint32_t normf, normg, norm;
+ int lim;
+
+ /*
+ * The poly_small_mkgauss() function makes sure
+ * that the sum of coefficients is 1 modulo 2
+ * (i.e. the resultant of the polynomial with phi
+ * will be odd).
+ */
+ poly_small_mkgauss(rc, f, logn);
+ poly_small_mkgauss(rc, g, logn);
+
+ /*
+ * Verify that all coefficients are within the bounds
+ * defined in max_fg_bits. This is the case with
+ * overwhelming probability; this guarantees that the
+ * key will be encodable with FALCON_COMP_TRIM.
+ */
+ lim = 1 << (PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[logn] - 1);
+ for (u = 0; u < n; u ++) {
+ /*
+ * We can use non-CT tests since on any failure
+ * we will discard f and g.
+ */
+ if (f[u] >= lim || f[u] <= -lim
+ || g[u] >= lim || g[u] <= -lim) {
+ lim = -1;
+ break;
+ }
+ }
+ if (lim < 0) {
+ continue;
+ }
+
+ /*
+ * Bound is 1.17*sqrt(q). We compute the squared
+ * norms. With q = 12289, the squared bound is:
+ * (1.17^2)* 12289 = 16822.4121
+ * Since f and g are integral, the squared norm
+ * of (g,-f) is an integer.
+ */
+ normf = poly_small_sqnorm(f, logn);
+ normg = poly_small_sqnorm(g, logn);
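+		/*
+		 * If either squared norm has its top bit set, the OR
+		 * below saturates norm to 2^32-1, so the bound check
+		 * necessarily fails.
+		 */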
+ norm = (normf + normg) | -((normf | normg) >> 31);
+ if (norm >= 16823) {
+ continue;
+ }
+
+ /*
+ * We compute the orthogonalized vector norm.
+ */
+ rt1 = (fpr *)tmp;
+ rt2 = rt1 + n;
+ rt3 = rt2 + n;
+ poly_small_to_fp(rt1, f, logn);
+ poly_small_to_fp(rt2, g, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_invnorm2_fft(rt3, rt1, rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_adj_fft(rt2, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(rt1, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(rt2, fpr_q, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt1, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_autoadj_fft(rt2, rt3, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(rt2, logn);
+ bnorm = fpr_zero;
+ for (u = 0; u < n; u ++) {
+ bnorm = fpr_add(bnorm, fpr_sqr(rt1[u]));
+ bnorm = fpr_add(bnorm, fpr_sqr(rt2[u]));
+ }
+ if (!fpr_lt(bnorm, fpr_bnorm_max)) {
+ continue;
+ }
+
+ /*
+ * Compute public key h = g/f mod X^N+1 mod q. If this
+ * fails, we must restart.
+ */
+ if (h == NULL) {
+ h2 = (uint16_t *)tmp;
+ tmp2 = h2 + n;
+ } else {
+ h2 = h;
+ tmp2 = (uint16_t *)tmp;
+ }
+ if (!PQCLEAN_FALCONPADDED512_CLEAN_compute_public(h2, f, g, logn, (uint8_t *)tmp2)) {
+ continue;
+ }
+
+ /*
+ * Solve the NTRU equation to get F and G.
+ */
+ lim = (1 << (PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[logn] - 1)) - 1;
+ if (!solve_NTRU(logn, F, G, f, g, lim, (uint32_t *)tmp)) {
+ continue;
+ }
+
+ /*
+ * Key pair is generated.
+ */
+ break;
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/pqclean.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/pqclean.c
new file mode 100644
index 000000000..7edf6a874
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/pqclean.c
@@ -0,0 +1,376 @@
+/*
+ * Wrapper for implementing the PQClean API.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "api.h"
+#include "inner.h"
+
+#define NONCELEN 40
+
+#include "randombytes.h"
+
+/*
+ * Encoding formats (nnnn = log of degree, 9 for Falcon-512, 10 for Falcon-1024)
+ *
+ * private key:
+ * header byte: 0101nnnn
+ * private f (6 or 5 bits by element, depending on degree)
+ * private g (6 or 5 bits by element, depending on degree)
+ * private F (8 bits by element)
+ *
+ * public key:
+ * header byte: 0000nnnn
+ * public h (14 bits by element)
+ *
+ * signature:
+ * header byte: 0011nnnn
+ * nonce (r) 40 bytes
+ * value (s) compressed format
+ * padding to 666 bytes
+ *
+ * message + signature:
+ * signature 666 bytes
+ * message
+ */
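+
+/*
+ * For Falcon-padded-512, nnnn = 9, so the header bytes used below are
+ * 0x59 (private key), 0x09 (public key) and 0x39 (signature).
+ */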
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(
+ uint8_t *pk, uint8_t *sk) {
+ union {
+ uint8_t b[FALCON_KEYGEN_TEMP_9];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512];
+ uint16_t h[512];
+ unsigned char seed[48];
+ inner_shake256_context rng;
+ size_t u, v;
+
+ /*
+ * Generate key pair.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&rng);
+ inner_shake256_inject(&rng, seed, sizeof seed);
+ inner_shake256_flip(&rng);
+ PQCLEAN_FALCONPADDED512_CLEAN_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);
+ inner_shake256_ctx_release(&rng);
+
+ /*
+ * Encode private key.
+ */
+ sk[0] = 0x50 + 9;
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ f, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ g, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_encode(
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u,
+ F, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[9]);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+
+ /*
+ * Encode public key.
+ */
+ pk[0] = 0x00 + 9;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_modq_encode(
+ pk + 1, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1,
+ h, 9);
+ if (v != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the signature. nonce[] receives the nonce and must have length
+ * NONCELEN bytes. sigbuf[] receives the signature value (without nonce
+ * or header byte), with sigbuflen providing the maximum value length.
+ *
+ * If a signature could be computed but not encoded because it would
+ * exceed the output buffer size, then a new signature is computed. If
+ * the provided buffer size is too low, this could loop indefinitely, so
+ * the caller must provide a size that can accommodate signatures with a
+ * large enough probability.
+ *
+ * Return value: 0 on success, -1 on error.
+ */
+static int
+do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ union {
+ uint8_t b[72 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ int8_t f[512], g[512], F[512], G[512];
+ struct {
+ int16_t sig[512];
+ uint16_t hm[512];
+ } r;
+ unsigned char seed[48];
+ inner_shake256_context sc;
+ size_t u, v;
+
+ /*
+ * Decode the private key.
+ */
+ if (sk[0] != 0x50 + 9) {
+ return -1;
+ }
+ u = 1;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ f, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ g, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_fg_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ v = PQCLEAN_FALCONPADDED512_CLEAN_trim_i8_decode(
+ F, 9, PQCLEAN_FALCONPADDED512_CLEAN_max_FG_bits[9],
+ sk + u, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES - u);
+ if (v == 0) {
+ return -1;
+ }
+ u += v;
+ if (u != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_SECRETKEYBYTES) {
+ return -1;
+ }
+ if (!PQCLEAN_FALCONPADDED512_CLEAN_complete_private(G, f, g, F, 9, tmp.b)) {
+ return -1;
+ }
+
+ /*
+ * Create a random nonce (40 bytes).
+ */
+ randombytes(nonce, NONCELEN);
+
+ /*
+ * Hash message nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(&sc, r.hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Initialize a RNG.
+ */
+ randombytes(seed, sizeof seed);
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, seed, sizeof seed);
+ inner_shake256_flip(&sc);
+
+ /*
+ * Compute and return the signature. This loops until a signature
+ * value is found that fits in the provided buffer.
+ */
+ for (;;) {
+ PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(r.sig, &sc, f, g, F, G, r.hm, 9, tmp.b);
+ v = PQCLEAN_FALCONPADDED512_CLEAN_comp_encode(sigbuf, sigbuflen, r.sig, 9);
+ if (v != 0) {
+ inner_shake256_ctx_release(&sc);
+ memset(sigbuf + v, 0, sigbuflen - v);
+ return 0;
+ }
+ }
+}
+
+/*
+ * Verify a signature. The nonce has size NONCELEN bytes. sigbuf[]
+ * (of size sigbuflen) contains the signature value, not including the
+ * header byte or nonce. Return value is 0 on success, -1 on error.
+ */
+static int
+do_verify(
+ const uint8_t *nonce, const uint8_t *sigbuf, size_t sigbuflen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ union {
+ uint8_t b[2 * 512];
+ uint64_t dummy_u64;
+ fpr dummy_fpr;
+ } tmp;
+ uint16_t h[512], hm[512];
+ int16_t sig[512];
+ inner_shake256_context sc;
+ size_t v;
+
+ /*
+ * Decode public key.
+ */
+ if (pk[0] != 0x00 + 9) {
+ return -1;
+ }
+ if (PQCLEAN_FALCONPADDED512_CLEAN_modq_decode(h, 9,
+ pk + 1, PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1)
+ != PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_PUBLICKEYBYTES - 1) {
+ return -1;
+ }
+ PQCLEAN_FALCONPADDED512_CLEAN_to_ntt_monty(h, 9);
+
+ /*
+ * Decode signature.
+ */
+ if (sigbuflen == 0) {
+ return -1;
+ }
+
+ v = PQCLEAN_FALCONPADDED512_CLEAN_comp_decode(sig, 9, sigbuf, sigbuflen);
+ if (v == 0) {
+ return -1;
+ }
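+
+	/*
+	 * For the padded format, the signature buffer has a fixed
+	 * length; any bytes after the encoded value must be zero
+	 * padding, otherwise the signature is rejected.
+	 */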
+ if (v != sigbuflen) {
+ if (sigbuflen == PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1) {
+ while (v < sigbuflen) {
+ if (sigbuf[v++] != 0) {
+ return -1;
+ }
+ }
+ } else {
+ return -1;
+ }
+ }
+
+ /*
+ * Hash nonce + message into a vector.
+ */
+ inner_shake256_init(&sc);
+ inner_shake256_inject(&sc, nonce, NONCELEN);
+ inner_shake256_inject(&sc, m, mlen);
+ inner_shake256_flip(&sc);
+ PQCLEAN_FALCONPADDED512_CLEAN_hash_to_point_ct(&sc, hm, 9, tmp.b);
+ inner_shake256_ctx_release(&sc);
+
+ /*
+ * Verify signature.
+ */
+ if (!PQCLEAN_FALCONPADDED512_CLEAN_verify_raw(hm, sig, h, 9, tmp.b)) {
+ return -1;
+ }
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(
+ uint8_t *sig, size_t *siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ size_t vlen;
+
+ vlen = PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sig + 1, sig + 1 + NONCELEN, vlen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sig[0] = 0x30 + 9;
+ *siglen = 1 + NONCELEN + vlen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(
+ const uint8_t *sig, size_t siglen,
+ const uint8_t *m, size_t mlen, const uint8_t *pk) {
+ if (siglen < 1 + NONCELEN) {
+ return -1;
+ }
+ if (sig[0] != 0x30 + 9) {
+ return -1;
+ }
+ return do_verify(sig + 1,
+ sig + 1 + NONCELEN, siglen - 1 - NONCELEN, m, mlen, pk);
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign(
+ uint8_t *sm, size_t *smlen,
+ const uint8_t *m, size_t mlen, const uint8_t *sk) {
+ uint8_t *sigbuf;
+ size_t sigbuflen;
+
+ /*
+ * Move the message to its final location; this is a memmove() so
+ * it handles overlaps properly.
+ */
+ memmove(sm + PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES, m, mlen);
+ sigbuf = sm + 1 + NONCELEN;
+ sigbuflen = PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ if (do_sign(sm + 1, sigbuf, sigbuflen, m, mlen, sk) < 0) {
+ return -1;
+ }
+ sm[0] = 0x30 + 9;
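+	/*
+	 * Account for the header byte in sigbuflen, so that *smlen
+	 * below equals mlen + CRYPTO_BYTES.
+	 */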
+ sigbuflen ++;
+ *smlen = mlen + NONCELEN + sigbuflen;
+ return 0;
+}
+
+/* see api.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_open(
+ uint8_t *m, size_t *mlen,
+ const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+ const uint8_t *sigbuf;
+ size_t pmlen, sigbuflen;
+
+ if (smlen < PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES) {
+ return -1;
+ }
+ sigbuflen = PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES - NONCELEN - 1;
+ pmlen = smlen - PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES;
+ if (sm[0] != 0x30 + 9) {
+ return -1;
+ }
+ sigbuf = sm + 1 + NONCELEN;
+
+ /*
+ * The one-byte signature header has been verified. Nonce is at sm+1
+ * followed by the signature (pointed to by sigbuf). The message
+ * follows the signature value.
+ */
+ if (do_verify(sm + 1, sigbuf, sigbuflen,
+ sm + PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES, pmlen, pk) < 0) {
+ return -1;
+ }
+
+ /*
+ * Signature is correct, we just have to copy/move the message
+ * to its final destination. The memmove() properly handles
+ * overlaps.
+ */
+ memmove(m, sm + PQCLEAN_FALCONPADDED512_CLEAN_CRYPTO_BYTES, pmlen);
+ *mlen = pmlen;
+ return 0;
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/rng.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/rng.c
new file mode 100644
index 000000000..ccce5e886
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/rng.c
@@ -0,0 +1,188 @@
+/*
+ * PRNG and interface to the system RNG.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <string.h>
+
+#include "inner.h"
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
+ /*
+ * To ensure reproducibility for a given seed, we
+ * must enforce little-endian interpretation of
+ * the state words.
+ */
+ uint8_t tmp[56];
+ uint64_t th, tl;
+ int i;
+
+ uint32_t *d32 = (uint32_t *) p->state.d;
+ uint64_t *d64 = (uint64_t *) p->state.d;
+
+ inner_shake256_extract(src, tmp, 56);
+ for (i = 0; i < 14; i ++) {
+ uint32_t w;
+
+ w = (uint32_t)tmp[(i << 2) + 0]
+ | ((uint32_t)tmp[(i << 2) + 1] << 8)
+ | ((uint32_t)tmp[(i << 2) + 2] << 16)
+ | ((uint32_t)tmp[(i << 2) + 3] << 24);
+ d32[i] = w;
+ }
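+
+	/*
+	 * Bytes 48..55 of the state hold the 64-bit block counter,
+	 * reassembled here in little-endian order.
+	 */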
+ tl = d32[48 / sizeof(uint32_t)];
+ th = d32[52 / sizeof(uint32_t)];
+ d64[48 / sizeof(uint64_t)] = tl + (th << 32);
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+}
+
+/*
+ * PRNG based on ChaCha20.
+ *
+ * State consists in key (32 bytes) then IV (16 bytes) and block counter
+ * (8 bytes). Normally, we should not care about local endianness (this
+ * is for a PRNG), but for the NIST competition we need reproducible KAT
+ * vectors that work across architectures, so we enforce little-endian
+ * interpretation where applicable. Moreover, output words are "spread
+ * out" over the output buffer with the interleaving pattern that is
+ * naturally obtained from the AVX2 implementation that runs eight
+ * ChaCha20 instances in parallel.
+ *
+ * The block counter is XORed into the first 8 bytes of the IV.
+ */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(prng *p) {
+
+ static const uint32_t CW[] = {
+ 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+ };
+
+ uint64_t cc;
+ size_t u;
+
+ /*
+ * State uses local endianness. Only the output bytes must be
+ * converted to little endian (if used on a big-endian machine).
+ */
+ cc = *(uint64_t *)(p->state.d + 48);
+ for (u = 0; u < 8; u ++) {
+ uint32_t state[16];
+ size_t v;
+ int i;
+
+ memcpy(&state[0], CW, sizeof CW);
+ memcpy(&state[4], p->state.d, 48);
+ state[14] ^= (uint32_t)cc;
+ state[15] ^= (uint32_t)(cc >> 32);
+ for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d) do { \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 16) | (state[d] >> 16); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 12) | (state[b] >> 20); \
+ state[a] += state[b]; \
+ state[d] ^= state[a]; \
+ state[d] = (state[d] << 8) | (state[d] >> 24); \
+ state[c] += state[d]; \
+ state[b] ^= state[c]; \
+ state[b] = (state[b] << 7) | (state[b] >> 25); \
+ } while (0)
+
+ QROUND( 0, 4, 8, 12);
+ QROUND( 1, 5, 9, 13);
+ QROUND( 2, 6, 10, 14);
+ QROUND( 3, 7, 11, 15);
+ QROUND( 0, 5, 10, 15);
+ QROUND( 1, 6, 11, 12);
+ QROUND( 2, 7, 8, 13);
+ QROUND( 3, 4, 9, 14);
+
+#undef QROUND
+
+ }
+
+ for (v = 0; v < 4; v ++) {
+ state[v] += CW[v];
+ }
+ for (v = 4; v < 14; v ++) {
+ state[v] += ((uint32_t *)p->state.d)[v - 4];
+ }
+ state[14] += ((uint32_t *)p->state.d)[10]
+ ^ (uint32_t)cc;
+ state[15] += ((uint32_t *)p->state.d)[11]
+ ^ (uint32_t)(cc >> 32);
+ cc ++;
+
+ /*
+ * We mimic the interleaving that is used in the AVX2
+ * implementation.
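+		 * Word v of ChaCha20 instance u lands at byte offset
+		 * (u << 2) + (v << 5) of the output buffer.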
+ */
+ for (v = 0; v < 16; v ++) {
+ p->buf.d[(u << 2) + (v << 5) + 0] =
+ (uint8_t)state[v];
+ p->buf.d[(u << 2) + (v << 5) + 1] =
+ (uint8_t)(state[v] >> 8);
+ p->buf.d[(u << 2) + (v << 5) + 2] =
+ (uint8_t)(state[v] >> 16);
+ p->buf.d[(u << 2) + (v << 5) + 3] =
+ (uint8_t)(state[v] >> 24);
+ }
+ }
+ *(uint64_t *)(p->state.d + 48) = cc;
+
+ p->ptr = 0;
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_prng_get_bytes(prng *p, void *dst, size_t len) {
+ uint8_t *buf;
+
+ buf = dst;
+ while (len > 0) {
+ size_t clen;
+
+ clen = (sizeof p->buf.d) - p->ptr;
+ if (clen > len) {
+ clen = len;
+ }
+		memcpy(buf, p->buf.d + p->ptr, clen);
+ buf += clen;
+ len -= clen;
+ p->ptr += clen;
+ if (p->ptr == sizeof p->buf.d) {
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_refill(p);
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/sign.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/sign.c
new file mode 100644
index 000000000..5e37a4613
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/sign.c
@@ -0,0 +1,1248 @@
+/*
+ * Falcon signature generation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* =================================================================== */
+
+/*
+ * Compute degree N from logarithm 'logn'.
+ */
+#define MKN(logn) ((size_t)1 << (logn))
+
+/* =================================================================== */
+/*
+ * Binary case:
+ * N = 2^logn
+ * phi = X^N+1
+ */
+
+/*
+ * Get the size of the LDL tree for an input with polynomials of size
+ * 2^logn. The size is expressed in the number of elements.
+ */
+static inline unsigned
+ffLDL_treesize(unsigned logn) {
+ /*
+ * For logn = 0 (polynomials are constant), the "tree" is a
+ * single element. Otherwise, the tree node has size 2^logn, and
+	 * has two child trees of size logn-1 each. Thus, treesize s()
+ * must fulfill these two relations:
+ *
+ * s(0) = 1
+ * s(logn) = (2^logn) + 2*s(logn-1)
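+	 *
+	 * which resolves to s(logn) = (logn + 1) * 2^logn, the value
+	 * returned below.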
+ */
+ return (logn + 1) << logn;
+}
+
+/*
+ * Inner function for ffLDL_fft(). It expects the matrix to be both
+ * auto-adjoint and quasicyclic; also, it uses the source operands
+ * as modifiable temporaries.
+ *
+ * tmp[] must have room for at least one polynomial.
+ */
+static void
+ffLDL_fft_inner(fpr *tree,
+ fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g0[0];
+ return;
+ }
+ hn = n >> 1;
+
+ /*
+ * The LDL decomposition yields L (which is written in the tree)
+ * and the diagonal of D. Since d00 = g0, we just write d11
+ * into tmp.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
+
+ /*
+ * Split d00 (currently in g0) and d11 (currently in tmp). We
+ * reuse g0 and g1 as temporary storage spaces:
+ * d00 splits into g1, g1+hn
+ * d11 splits into g0, g0+hn
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(g1, g1 + hn, g0, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(g0, g0 + hn, tmp, logn);
+
+ /*
+ * Each split result is the first row of a new auto-adjoint
+ * quasicyclic matrix for the next recursive step.
+ */
+ ffLDL_fft_inner(tree + n,
+ g1, g1 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ g0, g0 + hn, logn - 1, tmp);
+}
+
+/*
+ * Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
+ * is provided as three polynomials (FFT representation).
+ *
+ * The "tree" array is filled with the computed tree, of size
+ * (logn+1)*(2^logn) elements (see ffLDL_treesize()).
+ *
+ * Input arrays MUST NOT overlap, except possibly the three unmodified
+ * arrays g00, g01 and g11. tmp[] should have room for at least three
+ * polynomials of 2^logn elements each.
+ */
+static void
+ffLDL_fft(fpr *tree, const fpr *g00,
+ const fpr *g01, const fpr *g11,
+ unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *d00, *d11;
+
+ n = MKN(logn);
+ if (n == 1) {
+ tree[0] = g00[0];
+ return;
+ }
+ hn = n >> 1;
+ d00 = tmp;
+ d11 = tmp + n;
+ tmp += n << 1;
+
+ memcpy(d00, g00, n * sizeof * g00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
+
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(tmp, tmp + hn, d00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(d00, d00 + hn, d11, logn);
+ memcpy(d11, tmp, n * sizeof * tmp);
+ ffLDL_fft_inner(tree + n,
+ d11, d11 + hn, logn - 1, tmp);
+ ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
+ d00, d00 + hn, logn - 1, tmp);
+}
+
+/*
+ * Normalize an ffLDL tree: each leaf of value x is replaced with
+ * sigma / sqrt(x).
+ */
+static void
+ffLDL_binary_normalize(fpr *tree, unsigned orig_logn, unsigned logn) {
+ /*
+ * TODO: make an iterative version.
+ */
+ size_t n;
+
+ n = MKN(logn);
+ if (n == 1) {
+ /*
+ * We actually store in the tree leaf the inverse of
+ * the value mandated by the specification: this
+ * saves a division both here and in the sampler.
+ */
+ tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma[orig_logn]);
+ } else {
+ ffLDL_binary_normalize(tree + n, orig_logn, logn - 1);
+ ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
+ orig_logn, logn - 1);
+ }
+}
+
+/* =================================================================== */
+
+/*
+ * Convert an integer polynomial (with small values) into the
+ * representation with complex numbers.
+ */
+static void
+smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
+ size_t n, u;
+
+ n = MKN(logn);
+ for (u = 0; u < n; u ++) {
+ r[u] = fpr_of(t[u]);
+ }
+}
+
+/*
+ * The expanded private key contains:
+ * - The B0 matrix (four elements)
+ * - The ffLDL tree
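+ *
+ * With the offsets below, this amounts to 4*2^logn fpr values for the
+ * B0 matrix plus ffLDL_treesize(logn) = (logn+1)*2^logn values for the
+ * tree.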
+ */
+
+static inline size_t
+skoff_b00(unsigned logn) {
+ (void)logn;
+ return 0;
+}
+
+static inline size_t
+skoff_b01(unsigned logn) {
+ return MKN(logn);
+}
+
+static inline size_t
+skoff_b10(unsigned logn) {
+ return 2 * MKN(logn);
+}
+
+static inline size_t
+skoff_b11(unsigned logn) {
+ return 3 * MKN(logn);
+}
+
+static inline size_t
+skoff_tree(unsigned logn) {
+ return 4 * MKN(logn);
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_expand_privkey(fpr *expanded_key,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ unsigned logn, uint8_t *tmp) {
+ size_t n;
+ fpr *rf, *rg, *rF, *rG;
+ fpr *b00, *b01, *b10, *b11;
+ fpr *g00, *g01, *g11, *gxx;
+ fpr *tree;
+
+ n = MKN(logn);
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * We load the private key elements directly into the B0 matrix,
+ * since B0 = [[g, -f], [G, -F]].
+ */
+ rf = b01;
+ rg = b00;
+ rF = b11;
+ rG = b10;
+
+ smallints_to_fpr(rf, f, logn);
+ smallints_to_fpr(rg, g, logn);
+ smallints_to_fpr(rF, F, logn);
+ smallints_to_fpr(rG, G, logn);
+
+ /*
+ * Compute the FFT for the key elements, and negate f and F.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rf, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rg, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rF, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(rG, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(rf, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(rF, logn);
+
+ /*
+ * The Gram matrix is G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle).
+ */
+ g00 = (fpr *)tmp;
+ g01 = g00 + n;
+ g11 = g01 + n;
+ gxx = g11 + n;
+
+ memcpy(g00, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(g00, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(g00, gxx, logn);
+
+ memcpy(g01, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(g01, b10, logn);
+ memcpy(gxx, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(gxx, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(g01, gxx, logn);
+
+ memcpy(g11, b10, n * sizeof * b10);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(g11, logn);
+ memcpy(gxx, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(gxx, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(g11, gxx, logn);
+
+ /*
+ * Compute the Falcon tree.
+ */
+ ffLDL_fft(tree, g00, g01, g11, logn, gxx);
+
+ /*
+ * Normalize tree.
+ */
+ ffLDL_binary_normalize(tree, logn, logn);
+}
+
+typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
+
+/*
+ * Perform Fast Fourier Sampling for target vector t. The Gram matrix
+ * is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
+ * is written over (t0,t1). The Gram matrix is modified as well. The
+ * tmp[] buffer must have room for four polynomials.
+ */
+static void
+ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
+ fpr *t0, fpr *t1,
+ fpr *g00, fpr *g01, fpr *g11,
+ unsigned orig_logn, unsigned logn, fpr *tmp) {
+ size_t n, hn;
+ fpr *z0, *z1;
+
+ /*
+ * Deepest level: the LDL tree leaf value is just g00 (the
+ * array has length only 1 at this point); we normalize it
+ * with regards to sigma, then use it for sampling.
+ */
+ if (logn == 0) {
+ fpr leaf;
+
+ leaf = g00[0];
+ leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma[orig_logn]);
+ t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
+ t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
+ return;
+ }
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+
+ /*
+ * Decompose G into LDL. We only need d00 (identical to g00),
+ * d11, and l10; we do that in place.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_LDL_fft(g00, g01, g11, logn);
+
+ /*
+ * Split d00 and d11 and expand them into half-size quasi-cyclic
+ * Gram matrices. We also save l10 in tmp[].
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(tmp, tmp + hn, g00, logn);
+ memcpy(g00, tmp, n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(tmp, tmp + hn, g11, logn);
+ memcpy(g11, tmp, n * sizeof * tmp);
+ memcpy(tmp, g01, n * sizeof * g01);
+ memcpy(g01, g00, hn * sizeof * g00);
+ memcpy(g01 + hn, g11, hn * sizeof * g00);
+
+ /*
+ * The half-size Gram matrices for the recursive LDL tree
+ * building are now:
+ * - left sub-tree: g00, g00+hn, g01
+ * - right sub-tree: g11, g11+hn, g01+hn
+ * l10 is in tmp[].
+ */
+
+ /*
+ * We split t1 and use the first recursive call on the two
+ * halves, using the right sub-tree. The result is merged
+ * back into tmp + 2*n.
+ */
+ z1 = tmp + n;
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
+ g11, g11 + hn, g01 + hn, orig_logn, logn - 1, z1 + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * l10.
+ * At that point, l10 is in tmp, t1 is unmodified, and z1 is
+ * in tmp + (n << 1). The buffer in z1 is free.
+ *
+ * In the end, z1 is written over t1, and tb0 is in t0.
+ */
+ memcpy(z1, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(z1, tmp + (n << 1), logn);
+ memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(t0, tmp, logn);
+
+ /*
+ * Second recursive invocation, on the split tb0 (currently in t0)
+ * and the left sub-tree.
+ */
+ z0 = tmp;
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z0, z0 + hn, t0, logn);
+ ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
+ g00, g00 + hn, g01, orig_logn, logn - 1, z0 + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(t0, z0, z0 + hn, logn);
+}
+
+/*
+ * Perform Fast Fourier Sampling for target vector t and LDL tree T.
+ * tmp[] must have size for at least two polynomials of size 2^logn.
+ */
+static void
+ffSampling_fft(samplerZ samp, void *samp_ctx,
+ fpr *z0, fpr *z1,
+ const fpr *tree,
+ const fpr *t0, const fpr *t1, unsigned logn,
+ fpr *tmp) {
+ size_t n, hn;
+ const fpr *tree0, *tree1;
+
+ /*
+ * When logn == 2, we inline the last two recursion levels.
+ */
+ if (logn == 2) {
+ fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ tree0 = tree + 4;
+ tree1 = tree + 8;
+
+ /*
+ * We split t1 into w*, then do the recursive invocation,
+ * with output in w*. We finally merge back into z1.
+ */
+ a_re = t1[0];
+ a_im = t1[2];
+ b_re = t1[1];
+ b_im = t1[3];
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree1[3];
+ w2 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, w2);
+ a_im = fpr_sub(x1, w3);
+ b_re = tree1[0];
+ b_im = tree1[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree1[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z1[0] = w0 = fpr_add(a_re, c_re);
+ z1[2] = w2 = fpr_add(a_im, c_im);
+ z1[1] = w1 = fpr_sub(a_re, c_re);
+ z1[3] = w3 = fpr_sub(a_im, c_im);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+ */
+ w0 = fpr_sub(t1[0], w0);
+ w1 = fpr_sub(t1[1], w1);
+ w2 = fpr_sub(t1[2], w2);
+ w3 = fpr_sub(t1[3], w3);
+
+ a_re = w0;
+ a_im = w2;
+ b_re = tree[0];
+ b_im = tree[2];
+ w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ a_re = w1;
+ a_im = w3;
+ b_re = tree[1];
+ b_im = tree[3];
+ w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+ w0 = fpr_add(w0, t0[0]);
+ w1 = fpr_add(w1, t0[1]);
+ w2 = fpr_add(w2, t0[2]);
+ w3 = fpr_add(w3, t0[3]);
+
+ /*
+ * Second recursive invocation.
+ */
+ a_re = w0;
+ a_im = w2;
+ b_re = w1;
+ b_im = w3;
+ c_re = fpr_add(a_re, b_re);
+ c_im = fpr_add(a_im, b_im);
+ w0 = fpr_half(c_re);
+ w1 = fpr_half(c_im);
+ c_re = fpr_sub(a_re, b_re);
+ c_im = fpr_sub(a_im, b_im);
+ w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+ w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+ x0 = w2;
+ x1 = w3;
+ sigma = tree0[3];
+ w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree0[0];
+ b_im = tree0[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, w0);
+ x1 = fpr_add(c_im, w1);
+ sigma = tree0[2];
+ w0 = fpr_of(samp(samp_ctx, x0, sigma));
+ w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+ a_re = w0;
+ a_im = w1;
+ b_re = w2;
+ b_im = w3;
+ c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+ c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+ z0[0] = fpr_add(a_re, c_re);
+ z0[2] = fpr_add(a_im, c_im);
+ z0[1] = fpr_sub(a_re, c_re);
+ z0[3] = fpr_sub(a_im, c_im);
+
+ return;
+ }
+
+ /*
+ * Case logn == 1 is reachable only when using Falcon-2 (the
+ * smallest size for which Falcon is mathematically defined, but
+ * of course way too insecure to be of any use).
+ */
+ if (logn == 1) {
+ fpr x0, x1, y0, y1, sigma;
+ fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+ x0 = t1[0];
+ x1 = t1[1];
+ sigma = tree[3];
+ z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+ a_re = fpr_sub(x0, y0);
+ a_im = fpr_sub(x1, y1);
+ b_re = tree[0];
+ b_im = tree[1];
+ c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+ c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+ x0 = fpr_add(c_re, t0[0]);
+ x1 = fpr_add(c_im, t0[1]);
+ sigma = tree[2];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+ return;
+ }
+
+ /*
+ * Normal end of recursion is for logn == 0. Since the last
+ * steps of the recursions were inlined in the blocks above
+ * (when logn == 1 or 2), this case is not reachable, and is
+ * retained here only for documentation purposes.
+
+ if (logn == 0) {
+ fpr x0, x1, sigma;
+
+ x0 = t0[0];
+ x1 = t1[0];
+ sigma = tree[0];
+ z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+ z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
+ return;
+ }
+
+ */
+
+ /*
+ * General recursive case (logn >= 3).
+ */
+
+ n = (size_t)1 << logn;
+ hn = n >> 1;
+ tree0 = tree + n;
+ tree1 = tree + n + ffLDL_treesize(logn - 1);
+
+ /*
+ * We split t1 into z1 (reused as temporary storage), then do
+ * the recursive invocation, with output in tmp. We finally
+ * merge back into z1.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z1, z1 + hn, t1, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree1, z1, z1 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(z1, tmp, tmp + hn, logn);
+
+ /*
+ * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
+ */
+ memcpy(tmp, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_sub(tmp, z1, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tmp, tree, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(tmp, t0, logn);
+
+ /*
+ * Second recursive invocation.
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_split_fft(z0, z0 + hn, tmp, logn);
+ ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
+ tree0, z0, z0 + hn, logn - 1, tmp + n);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_merge_fft(z0, tmp, tmp + hn, logn);
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again. This function uses an
+ * expanded key.
+ *
+ * tmp[] must have room for at least six polynomials.
+ */
+static int
+do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const fpr *expanded_key,
+ const uint16_t *hm,
+ unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ const fpr *b00, *b01, *b10, *b11, *tree;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+ t0 = tmp;
+ t1 = t0 + n;
+ b00 = expanded_key + skoff_b00(logn);
+ b01 = expanded_key + skoff_b01(logn);
+ b10 = expanded_key + skoff_b10(logn);
+ b11 = expanded_key + skoff_b11(logn);
+ tree = expanded_key + skoff_tree(logn);
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t0, ni, logn);
+
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Apply sampling. Output is written back in [tx, ty].
+ */
+ ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(t0, tx, n * sizeof * tx);
+ memcpy(t1, ty, n * sizeof * ty);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(t1, ty, logn);
+
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t1, logn);
+
+ /*
+ * Compute the signature.
+ */
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Compute a signature: the signature contains two vectors, s1 and s2.
+ * The s1 vector is not returned. The squared norm of (s1,s2) is
+ * computed, and if it is short enough, then s2 is returned into the
+ * s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
+ * returned; the caller should then try again.
+ *
+ * tmp[] must have room for at least nine polynomials.
+ */
+static int
+do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, fpr *tmp) {
+ size_t n, u;
+ fpr *t0, *t1, *tx, *ty;
+ fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
+ fpr ni;
+ uint32_t sqn, ng;
+ int16_t *s1tmp, *s2tmp;
+
+ n = MKN(logn);
+
+ /*
+ * Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b11, logn);
+
+ /*
+ * Compute the Gram matrix G = B·B*. Formulas are:
+ * g00 = b00*adj(b00) + b01*adj(b01)
+ * g01 = b00*adj(b10) + b01*adj(b11)
+ * g10 = b10*adj(b00) + b11*adj(b01)
+ * g11 = b10*adj(b10) + b11*adj(b11)
+ *
+ * For historical reasons, this implementation uses
+ * g00, g01 and g11 (upper triangle). g10 is not kept
+ * since it is equal to adj(g01).
+ *
+ * We _replace_ the matrix B with the Gram matrix, but we
+ * must keep b01 and b11 for computing the target vector.
+ */
+ t0 = b11 + n;
+ t1 = t0 + n;
+
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
+
+ memcpy(t1, b00, n * sizeof * b00);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(b00, t0, logn); // b00 <- g00
+ memcpy(t0, b01, n * sizeof * b01);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(b01, t1, logn); // b01 <- g01
+
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
+ memcpy(t1, b11, n * sizeof * b11);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(b10, t1, logn); // b10 <- g11
+
+ /*
+ * We rename variables to make things clearer. The three elements
+ * of the Gram matrix use the first 3*n slots of tmp[], followed
+ * by b11 and b01 (in that order).
+ */
+ g00 = b00;
+ g01 = b01;
+ g11 = b10;
+ b01 = t0;
+ t0 = b01 + n;
+ t1 = t0 + n;
+
+ /*
+ * Memory layout at that point:
+ * g00 g01 g11 b11 b01 t0 t1
+ */
+
+ /*
+ * Set the target vector to [hm, 0] (hm is the hashed message).
+ */
+ for (u = 0; u < n; u ++) {
+ t0[u] = fpr_of(hm[u]);
+ /* This is implicit.
+ t1[u] = fpr_zero;
+ */
+ }
+
+ /*
+ * Apply the lattice basis to obtain the real target
+ * vector (after normalization with regards to modulus).
+ */
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(t0, logn);
+ ni = fpr_inverse_of_q;
+ memcpy(t1, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t1, fpr_neg(ni), logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t0, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mulconst(t0, ni, logn);
+
+ /*
+ * b01 and b11 can be discarded, so we move back (t0,t1).
+ * Memory layout is now:
+ * g00 g01 g11 t0 t1
+ */
+ memcpy(b11, t0, n * 2 * sizeof * t0);
+ t0 = g11 + n;
+ t1 = t0 + n;
+
+ /*
+ * Apply sampling; result is written over (t0,t1).
+ */
+ ffSampling_fft_dyntree(samp, samp_ctx,
+ t0, t1, g00, g01, g11, logn, logn, t1 + n);
+
+ /*
+ * We arrange the layout back to:
+ * b00 b01 b10 b11 t0 t1
+ *
+ * We did not conserve the matrix basis, so we must recompute
+ * it now.
+ */
+ b00 = tmp;
+ b01 = b00 + n;
+ b10 = b01 + n;
+ b11 = b10 + n;
+ memmove(b11 + n, t0, n * 2 * sizeof * t0);
+ t0 = b11 + n;
+ t1 = t0 + n;
+ smallints_to_fpr(b01, f, logn);
+ smallints_to_fpr(b00, g, logn);
+ smallints_to_fpr(b11, F, logn);
+ smallints_to_fpr(b10, G, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_FFT(b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b01, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_neg(b11, logn);
+ tx = t1 + n;
+ ty = tx + n;
+
+ /*
+ * Get the lattice point corresponding to that tiny vector.
+ */
+ memcpy(tx, t0, n * sizeof * t0);
+ memcpy(ty, t1, n * sizeof * t1);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(tx, b00, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b10, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(tx, ty, logn);
+ memcpy(ty, t0, n * sizeof * t0);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(ty, b01, logn);
+
+ memcpy(t0, tx, n * sizeof * tx);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_mul_fft(t1, b11, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_poly_add(t1, ty, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t0, logn);
+ PQCLEAN_FALCONPADDED512_CLEAN_iFFT(t1, logn);
+
+ s1tmp = (int16_t *)tx;
+ sqn = 0;
+ ng = 0;
+ for (u = 0; u < n; u ++) {
+ int32_t z;
+
+ z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
+ sqn += (uint32_t)(z * z);
+ ng |= sqn;
+ s1tmp[u] = (int16_t)z;
+ }
+ sqn |= -(ng >> 31);
+
+ /*
+ * With "normal" degrees (e.g. 512 or 1024), it is very
+ * improbable that the computed vector is not short enough;
+ * however, it may happen in practice for the very reduced
+ * versions (e.g. degree 16 or below). In that case, the caller
+ * will loop, and we must not write anything into s2[] because
+ * s2[] may overlap with the hashed message hm[] and we need
+ * hm[] for the next iteration.
+ */
+ s2tmp = (int16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
+ }
+ if (PQCLEAN_FALCONPADDED512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
+ memcpy(s2, s2tmp, n * sizeof * s2);
+ memcpy(tmp, s1tmp, n * sizeof * s1tmp);
+ return 1;
+ }
+ return 0;
+}
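The header comments of the two signing paths above state their scratch needs: six polynomials for do_sign_tree() and nine for do_sign_dyn(), which must also rebuild the basis and the Gram matrix in place. A small sketch making those budgets explicit (hypothetical helper names):

/* Illustration only: minimum tmp[] budgets, in fpr slots (8 bytes each). */
static size_t sign_tree_tmp_fprs(unsigned logn) { return 6u * ((size_t)1 << logn); }
static size_t sign_dyn_tmp_fprs(unsigned logn)  { return 9u * ((size_t)1 << logn); }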
+
+/*
+ * Sample an integer value along a half-gaussian distribution centered
+ * on zero and standard deviation 1.8205, with a precision of 72 bits.
+ */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_gaussian0_sampler(prng *p) {
+
+ static const uint32_t dist[] = {
+ 10745844u, 3068844u, 3741698u,
+ 5559083u, 1580863u, 8248194u,
+ 2260429u, 13669192u, 2736639u,
+ 708981u, 4421575u, 10046180u,
+ 169348u, 7122675u, 4136815u,
+ 30538u, 13063405u, 7650655u,
+ 4132u, 14505003u, 7826148u,
+ 417u, 16768101u, 11363290u,
+ 31u, 8444042u, 8086568u,
+ 1u, 12844466u, 265321u,
+ 0u, 1232676u, 13644283u,
+ 0u, 38047u, 9111839u,
+ 0u, 870u, 6138264u,
+ 0u, 14u, 12545723u,
+ 0u, 0u, 3104126u,
+ 0u, 0u, 28824u,
+ 0u, 0u, 198u,
+ 0u, 0u, 1u
+ };
+
+ uint32_t v0, v1, v2, hi;
+ uint64_t lo;
+ size_t u;
+ int z;
+
+ /*
+ * Get a random 72-bit value, into three 24-bit limbs v0..v2.
+ */
+ lo = prng_get_u64(p);
+ hi = prng_get_u8(p);
+ v0 = (uint32_t)lo & 0xFFFFFF;
+ v1 = (uint32_t)(lo >> 24) & 0xFFFFFF;
+ v2 = (uint32_t)(lo >> 48) | (hi << 16);
+
+ /*
+ * Sampled value is z, such that v0..v2 is lower than the first
+ * z elements of the table.
+ */
+ z = 0;
+ for (u = 0; u < (sizeof dist) / sizeof(dist[0]); u += 3) {
+ uint32_t w0, w1, w2, cc;
+
+ w0 = dist[u + 2];
+ w1 = dist[u + 1];
+ w2 = dist[u + 0];
+ cc = (v0 - w0) >> 31;
+ cc = (v1 - w1 - cc) >> 31;
+ cc = (v2 - w2 - cc) >> 31;
+ z += (int)cc;
+ }
+ return z;
+
+}
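The loop above counts, in constant time, how many cumulative-distribution rows strictly exceed the 72-bit random value, by propagating a borrow across the three 24-bit limbs. For illustration only, the same count with an explicit (non-constant-time) 72-bit comparison; gaussian0_reference is a hypothetical name:

/* Illustration only, not constant-time. Each table row stores its high
 * 24-bit limb first (dist[u]); a row counts when (w2,w1,w0) > (v2,v1,v0). */
static int
gaussian0_reference(uint32_t v0, uint32_t v1, uint32_t v2,
                    const uint32_t *dist, size_t len) {
    int z = 0;
    for (size_t u = 0; u < len; u += 3) {
        uint32_t w2 = dist[u], w1 = dist[u + 1], w0 = dist[u + 2];
        if (v2 < w2 || (v2 == w2 && (v1 < w1 || (v1 == w1 && v0 < w0)))) {
            z ++;
        }
    }
    return z;
}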
+
+/*
+ * Sample a bit with probability exp(-x) for some x >= 0.
+ */
+static int
+BerExp(prng *p, fpr x, fpr ccs) {
+ int s, i;
+ fpr r;
+ uint32_t sw, w;
+ uint64_t z;
+
+ /*
+ * Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
+ * and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
+ */
+ s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
+ r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
+
+ /*
+ * It may happen (quite rarely) that s >= 64; if sigma = 1.2
+ * (the minimum value for sigma), r = 0 and b = 1, then we get
+ * s >= 64 if the half-Gaussian produced a z >= 13, which happens
+ * with probability about 0.000000000230383991, which is
+ * approximately equal to 2^(-32). In any case, if s >= 64,
+ * then BerExp will be non-zero with probability less than
+ * 2^(-64), so we can simply saturate s at 63.
+ */
+ sw = (uint32_t)s;
+ sw ^= (sw ^ 63) & -((63 - sw) >> 31);
+ s = (int)sw;
+
+ /*
+ * Compute exp(-r); we know that 0 <= r < log(2) at this point, so
+ * we can use fpr_expm_p63(), which yields a result scaled to 2^63.
+ * We scale it up to 2^64, then right-shift it by s bits because
+ * we really want exp(-x) = 2^(-s)*exp(-r).
+ *
+ * The "-1" operation makes sure that the value fits on 64 bits
+ * (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
+ * case). The bias is negligible since fpr_expm_p63() only computes
+ * with 51 bits of precision or so.
+ */
+ z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
+
+ /*
+ * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
+ * exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
+ * PRNG output to limit its consumption, the sign of the difference
+ * yields the expected result.
+ */
+ i = 64;
+ do {
+ i -= 8;
+ w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
+ } while (!w && i > 0);
+ return (int)(w >> 31);
+}
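The saturation of s at 63 in BerExp() is done without a branch. A minimal sketch of that bit trick in isolation (saturate_at_63 is a hypothetical name):

/* Illustration only: when sw > 63, (63 - sw) wraps to a value whose top bit
 * is 1, so -((63 - sw) >> 31) is an all-ones mask and the XOR swaps sw for
 * 63; otherwise the mask is zero and sw is unchanged. */
static inline uint32_t
saturate_at_63(uint32_t sw) {
    return sw ^ ((sw ^ 63) & -((63 - sw) >> 31));
}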
+
+/*
+ * The sampler produces a random integer that follows a discrete Gaussian
+ * distribution, centered on mu, and with standard deviation sigma. The
+ * provided parameter isigma is equal to 1/sigma.
+ *
+ * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
+ * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
+ */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
+ sampler_context *spc;
+ int s;
+ fpr r, dss, ccs;
+
+ spc = ctx;
+
+ /*
+ * Center is mu. We compute mu = s + r where s is an integer
+ * and 0 <= r < 1.
+ */
+ s = (int)fpr_floor(mu);
+ r = fpr_sub(mu, fpr_of(s));
+
+ /*
+ * dss = 1/(2*sigma^2) = 0.5*(isigma^2).
+ */
+ dss = fpr_half(fpr_sqr(isigma));
+
+ /*
+ * ccs = sigma_min / sigma = sigma_min * isigma.
+ */
+ ccs = fpr_mul(isigma, spc->sigma_min);
+
+ /*
+ * We now need to sample on center r.
+ */
+ for (;;) {
+ int z0, z, b;
+ fpr x;
+
+ /*
+ * Sample z for a Gaussian distribution. Then get a
+ * random bit b to turn the sampling into a bimodal
+ * distribution: if b = 1, we use z+1, otherwise we
+ * use -z. We thus have two situations:
+ *
+ * - b = 1: z >= 1 and sampled against a Gaussian
+ * centered on 1.
+ * - b = 0: z <= 0 and sampled against a Gaussian
+ * centered on 0.
+ */
+ z0 = PQCLEAN_FALCONPADDED512_CLEAN_gaussian0_sampler(&spc->p);
+ b = (int)prng_get_u8(&spc->p) & 1;
+ z = b + ((b << 1) - 1) * z0;
+
+ /*
+ * Rejection sampling. We want a Gaussian centered on r;
+ * but we sampled against a Gaussian centered on b (0 or
+ * 1). But we know that z is always in the range where
+ * our sampling distribution is greater than the Gaussian
+ * distribution, so rejection works.
+ *
+ * We got z with distribution:
+ * G(z) = exp(-((z-b)^2)/(2*sigma0^2))
+ * We target distribution:
+ * S(z) = exp(-((z-r)^2)/(2*sigma^2))
+ * Rejection sampling works by keeping the value z with
+ * probability S(z)/G(z), and starting again otherwise.
+ * This requires S(z) <= G(z), which is the case here.
+ * Thus, we simply need to keep our z with probability:
+ * P = exp(-x)
+ * where:
+ * x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
+ *
+ * Here, we scale up the Bernoulli distribution, which
+ * makes rejection more probable, but makes rejection
+ * rate sufficiently decorrelated from the Gaussian
+ * center and standard deviation that the whole sampler
+ * can be said to be constant-time.
+ */
+ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
+ x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
+ if (BerExp(&spc->p, x, ccs)) {
+ /*
+ * Rejection sampling was centered on r, but the
+ * actual center is mu = s + r.
+ */
+ return s + z;
+ }
+ }
+}
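The expression z = b + ((b << 1) - 1) * z0 above maps the half-Gaussian output onto the bimodal distribution without branching. Written out with an explicit branch, purely for illustration:

/* Illustration only: b = 1 gives z0 + 1 (Gaussian centered on 1, z >= 1),
 * b = 0 gives -z0 (Gaussian centered on 0, z <= 0). */
static inline int
bimodal_z(int b, int z0) {
    return (b == 1) ? (z0 + 1) : -z0;
}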
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
+ const fpr *expanded_key,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_tree(samp, samp_ctx, sig,
+ expanded_key, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
+ const int8_t *f, const int8_t *g,
+ const int8_t *F, const int8_t *G,
+ const uint16_t *hm, unsigned logn, uint8_t *tmp) {
+ fpr *ftmp;
+
+ ftmp = (fpr *)tmp;
+ for (;;) {
+ /*
+ * Signature produces short vectors s1 and s2. The
+ * signature is acceptable only if the aggregate vector
+ * s1,s2 is short; we must use the same bound as the
+ * verifier.
+ *
+ * If the signature is acceptable, then we return only s2
+ * (the verifier recomputes s1 from s2, the hashed message,
+ * and the public key).
+ */
+ sampler_context spc;
+ samplerZ samp;
+ void *samp_ctx;
+
+ /*
+ * Normal sampling. We use a fast PRNG seeded from our
+ * SHAKE context ('rng').
+ */
+ spc.sigma_min = fpr_sigma_min[logn];
+ PQCLEAN_FALCONPADDED512_CLEAN_prng_init(&spc.p, rng);
+ samp = PQCLEAN_FALCONPADDED512_CLEAN_sampler;
+ samp_ctx = &spc;
+
+ /*
+ * Do the actual signature.
+ */
+ if (do_sign_dyn(samp, samp_ctx, sig,
+ f, g, F, G, hm, logn, ftmp)) {
+ break;
+ }
+ }
+}
diff --git a/src/sig/falcon/pqclean_falcon-padded-512_clean/vrfy.c b/src/sig/falcon/pqclean_falcon-padded-512_clean/vrfy.c
new file mode 100644
index 000000000..5bcc2b52b
--- /dev/null
+++ b/src/sig/falcon/pqclean_falcon-padded-512_clean/vrfy.c
@@ -0,0 +1,852 @@
+/*
+ * Falcon signature verification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2017-2019 Falcon Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include "inner.h"
+
+/* ===================================================================== */
+/*
+ * Constants for NTT.
+ *
+ * n = 2^logn (2 <= n <= 1024)
+ * phi = X^n + 1
+ * q = 12289
+ * q0i = -1/q mod 2^16
+ * R = 2^16 mod q
+ * R2 = 2^32 mod q
+ */
+
+#define Q 12289
+#define Q0I 12287
+#define R 4091
+#define R2 10952
+
+/*
+ * Table for NTT, binary case:
+ * GMb[x] = R*(g^rev(x)) mod q
+ * where g = 7 (it is a 2048-th primitive root of 1 modulo q)
+ * and rev() is the bit-reversal function over 10 bits.
+ */
+static const uint16_t GMb[] = {
+ 4091, 7888, 11060, 11208, 6960, 4342, 6275, 9759,
+ 1591, 6399, 9477, 5266, 586, 5825, 7538, 9710,
+ 1134, 6407, 1711, 965, 7099, 7674, 3743, 6442,
+ 10414, 8100, 1885, 1688, 1364, 10329, 10164, 9180,
+ 12210, 6240, 997, 117, 4783, 4407, 1549, 7072,
+ 2829, 6458, 4431, 8877, 7144, 2564, 5664, 4042,
+ 12189, 432, 10751, 1237, 7610, 1534, 3983, 7863,
+ 2181, 6308, 8720, 6570, 4843, 1690, 14, 3872,
+ 5569, 9368, 12163, 2019, 7543, 2315, 4673, 7340,
+ 1553, 1156, 8401, 11389, 1020, 2967, 10772, 7045,
+ 3316, 11236, 5285, 11578, 10637, 10086, 9493, 6180,
+ 9277, 6130, 3323, 883, 10469, 489, 1502, 2851,
+ 11061, 9729, 2742, 12241, 4970, 10481, 10078, 1195,
+ 730, 1762, 3854, 2030, 5892, 10922, 9020, 5274,
+ 9179, 3604, 3782, 10206, 3180, 3467, 4668, 2446,
+ 7613, 9386, 834, 7703, 6836, 3403, 5351, 12276,
+ 3580, 1739, 10820, 9787, 10209, 4070, 12250, 8525,
+ 10401, 2749, 7338, 10574, 6040, 943, 9330, 1477,
+ 6865, 9668, 3585, 6633, 12145, 4063, 3684, 7680,
+ 8188, 6902, 3533, 9807, 6090, 727, 10099, 7003,
+ 6945, 1949, 9731, 10559, 6057, 378, 7871, 8763,
+ 8901, 9229, 8846, 4551, 9589, 11664, 7630, 8821,
+ 5680, 4956, 6251, 8388, 10156, 8723, 2341, 3159,
+ 1467, 5460, 8553, 7783, 2649, 2320, 9036, 6188,
+ 737, 3698, 4699, 5753, 9046, 3687, 16, 914,
+ 5186, 10531, 4552, 1964, 3509, 8436, 7516, 5381,
+ 10733, 3281, 7037, 1060, 2895, 7156, 8887, 5357,
+ 6409, 8197, 2962, 6375, 5064, 6634, 5625, 278,
+ 932, 10229, 8927, 7642, 351, 9298, 237, 5858,
+ 7692, 3146, 12126, 7586, 2053, 11285, 3802, 5204,
+ 4602, 1748, 11300, 340, 3711, 4614, 300, 10993,
+ 5070, 10049, 11616, 12247, 7421, 10707, 5746, 5654,
+ 3835, 5553, 1224, 8476, 9237, 3845, 250, 11209,
+ 4225, 6326, 9680, 12254, 4136, 2778, 692, 8808,
+ 6410, 6718, 10105, 10418, 3759, 7356, 11361, 8433,
+ 6437, 3652, 6342, 8978, 5391, 2272, 6476, 7416,
+ 8418, 10824, 11986, 5733, 876, 7030, 2167, 2436,
+ 3442, 9217, 8206, 4858, 5964, 2746, 7178, 1434,
+ 7389, 8879, 10661, 11457, 4220, 1432, 10832, 4328,
+ 8557, 1867, 9454, 2416, 3816, 9076, 686, 5393,
+ 2523, 4339, 6115, 619, 937, 2834, 7775, 3279,
+ 2363, 7488, 6112, 5056, 824, 10204, 11690, 1113,
+ 2727, 9848, 896, 2028, 5075, 2654, 10464, 7884,
+ 12169, 5434, 3070, 6400, 9132, 11672, 12153, 4520,
+ 1273, 9739, 11468, 9937, 10039, 9720, 2262, 9399,
+ 11192, 315, 4511, 1158, 6061, 6751, 11865, 357,
+ 7367, 4550, 983, 8534, 8352, 10126, 7530, 9253,
+ 4367, 5221, 3999, 8777, 3161, 6990, 4130, 11652,
+ 3374, 11477, 1753, 292, 8681, 2806, 10378, 12188,
+ 5800, 11811, 3181, 1988, 1024, 9340, 2477, 10928,
+ 4582, 6750, 3619, 5503, 5233, 2463, 8470, 7650,
+ 7964, 6395, 1071, 1272, 3474, 11045, 3291, 11344,
+ 8502, 9478, 9837, 1253, 1857, 6233, 4720, 11561,
+ 6034, 9817, 3339, 1797, 2879, 6242, 5200, 2114,
+ 7962, 9353, 11363, 5475, 6084, 9601, 4108, 7323,
+ 10438, 9471, 1271, 408, 6911, 3079, 360, 8276,
+ 11535, 9156, 9049, 11539, 850, 8617, 784, 7919,
+ 8334, 12170, 1846, 10213, 12184, 7827, 11903, 5600,
+ 9779, 1012, 721, 2784, 6676, 6552, 5348, 4424,
+ 6816, 8405, 9959, 5150, 2356, 5552, 5267, 1333,
+ 8801, 9661, 7308, 5788, 4910, 909, 11613, 4395,
+ 8238, 6686, 4302, 3044, 2285, 12249, 1963, 9216,
+ 4296, 11918, 695, 4371, 9793, 4884, 2411, 10230,
+ 2650, 841, 3890, 10231, 7248, 8505, 11196, 6688,
+ 4059, 6060, 3686, 4722, 11853, 5816, 7058, 6868,
+ 11137, 7926, 4894, 12284, 4102, 3908, 3610, 6525,
+ 7938, 7982, 11977, 6755, 537, 4562, 1623, 8227,
+ 11453, 7544, 906, 11816, 9548, 10858, 9703, 2815,
+ 11736, 6813, 6979, 819, 8903, 6271, 10843, 348,
+ 7514, 8339, 6439, 694, 852, 5659, 2781, 3716,
+ 11589, 3024, 1523, 8659, 4114, 10738, 3303, 5885,
+ 2978, 7289, 11884, 9123, 9323, 11830, 98, 2526,
+ 2116, 4131, 11407, 1844, 3645, 3916, 8133, 2224,
+ 10871, 8092, 9651, 5989, 7140, 8480, 1670, 159,
+ 10923, 4918, 128, 7312, 725, 9157, 5006, 6393,
+ 3494, 6043, 10972, 6181, 11838, 3423, 10514, 7668,
+ 3693, 6658, 6905, 11953, 10212, 11922, 9101, 8365,
+ 5110, 45, 2400, 1921, 4377, 2720, 1695, 51,
+ 2808, 650, 1896, 9997, 9971, 11980, 8098, 4833,
+ 4135, 4257, 5838, 4765, 10985, 11532, 590, 12198,
+ 482, 12173, 2006, 7064, 10018, 3912, 12016, 10519,
+ 11362, 6954, 2210, 284, 5413, 6601, 3865, 10339,
+ 11188, 6231, 517, 9564, 11281, 3863, 1210, 4604,
+ 8160, 11447, 153, 7204, 5763, 5089, 9248, 12154,
+ 11748, 1354, 6672, 179, 5532, 2646, 5941, 12185,
+ 862, 3158, 477, 7279, 5678, 7914, 4254, 302,
+ 2893, 10114, 6890, 9560, 9647, 11905, 4098, 9824,
+ 10269, 1353, 10715, 5325, 6254, 3951, 1807, 6449,
+ 5159, 1308, 8315, 3404, 1877, 1231, 112, 6398,
+ 11724, 12272, 7286, 1459, 12274, 9896, 3456, 800,
+ 1397, 10678, 103, 7420, 7976, 936, 764, 632,
+ 7996, 8223, 8445, 7758, 10870, 9571, 2508, 1946,
+ 6524, 10158, 1044, 4338, 2457, 3641, 1659, 4139,
+ 4688, 9733, 11148, 3946, 2082, 5261, 2036, 11850,
+ 7636, 12236, 5366, 2380, 1399, 7720, 2100, 3217,
+ 10912, 8898, 7578, 11995, 2791, 1215, 3355, 2711,
+ 2267, 2004, 8568, 10176, 3214, 2337, 1750, 4729,
+ 4997, 7415, 6315, 12044, 4374, 7157, 4844, 211,
+ 8003, 10159, 9290, 11481, 1735, 2336, 5793, 9875,
+ 8192, 986, 7527, 1401, 870, 3615, 8465, 2756,
+ 9770, 2034, 10168, 3264, 6132, 54, 2880, 4763,
+ 11805, 3074, 8286, 9428, 4881, 6933, 1090, 10038,
+ 2567, 708, 893, 6465, 4962, 10024, 2090, 5718,
+ 10743, 780, 4733, 4623, 2134, 2087, 4802, 884,
+ 5372, 5795, 5938, 4333, 6559, 7549, 5269, 10664,
+ 4252, 3260, 5917, 10814, 5768, 9983, 8096, 7791,
+ 6800, 7491, 6272, 1907, 10947, 6289, 11803, 6032,
+ 11449, 1171, 9201, 7933, 2479, 7970, 11337, 7062,
+ 8911, 6728, 6542, 8114, 8828, 6595, 3545, 4348,
+ 4610, 2205, 6999, 8106, 5560, 10390, 9321, 2499,
+ 2413, 7272, 6881, 10582, 9308, 9437, 3554, 3326,
+ 5991, 11969, 3415, 12283, 9838, 12063, 4332, 7830,
+ 11329, 6605, 12271, 2044, 11611, 7353, 11201, 11582,
+ 3733, 8943, 9978, 1627, 7168, 3935, 5050, 2762,
+ 7496, 10383, 755, 1654, 12053, 4952, 10134, 4394,
+ 6592, 7898, 7497, 8904, 12029, 3581, 10748, 5674,
+ 10358, 4901, 7414, 8771, 710, 6764, 8462, 7193,
+ 5371, 7274, 11084, 290, 7864, 6827, 11822, 2509,
+ 6578, 4026, 5807, 1458, 5721, 5762, 4178, 2105,
+ 11621, 4852, 8897, 2856, 11510, 9264, 2520, 8776,
+ 7011, 2647, 1898, 7039, 5950, 11163, 5488, 6277,
+ 9182, 11456, 633, 10046, 11554, 5633, 9587, 2333,
+ 7008, 7084, 5047, 7199, 9865, 8997, 569, 6390,
+ 10845, 9679, 8268, 11472, 4203, 1997, 2, 9331,
+ 162, 6182, 2000, 3649, 9792, 6363, 7557, 6187,
+ 8510, 9935, 5536, 9019, 3706, 12009, 1452, 3067,
+ 5494, 9692, 4865, 6019, 7106, 9610, 4588, 10165,
+ 6261, 5887, 2652, 10172, 1580, 10379, 4638, 9949
+};
+
+/*
+ * Table for inverse NTT, binary case:
+ * iGMb[x] = R*((1/g)^rev(x)) mod q
+ * Since g = 7, 1/g = 8778 mod 12289.
+ */
+static const uint16_t iGMb[] = {
+ 4091, 4401, 1081, 1229, 2530, 6014, 7947, 5329,
+ 2579, 4751, 6464, 11703, 7023, 2812, 5890, 10698,
+ 3109, 2125, 1960, 10925, 10601, 10404, 4189, 1875,
+ 5847, 8546, 4615, 5190, 11324, 10578, 5882, 11155,
+ 8417, 12275, 10599, 7446, 5719, 3569, 5981, 10108,
+ 4426, 8306, 10755, 4679, 11052, 1538, 11857, 100,
+ 8247, 6625, 9725, 5145, 3412, 7858, 5831, 9460,
+ 5217, 10740, 7882, 7506, 12172, 11292, 6049, 79,
+ 13, 6938, 8886, 5453, 4586, 11455, 2903, 4676,
+ 9843, 7621, 8822, 9109, 2083, 8507, 8685, 3110,
+ 7015, 3269, 1367, 6397, 10259, 8435, 10527, 11559,
+ 11094, 2211, 1808, 7319, 48, 9547, 2560, 1228,
+ 9438, 10787, 11800, 1820, 11406, 8966, 6159, 3012,
+ 6109, 2796, 2203, 1652, 711, 7004, 1053, 8973,
+ 5244, 1517, 9322, 11269, 900, 3888, 11133, 10736,
+ 4949, 7616, 9974, 4746, 10270, 126, 2921, 6720,
+ 6635, 6543, 1582, 4868, 42, 673, 2240, 7219,
+ 1296, 11989, 7675, 8578, 11949, 989, 10541, 7687,
+ 7085, 8487, 1004, 10236, 4703, 163, 9143, 4597,
+ 6431, 12052, 2991, 11938, 4647, 3362, 2060, 11357,
+ 12011, 6664, 5655, 7225, 5914, 9327, 4092, 5880,
+ 6932, 3402, 5133, 9394, 11229, 5252, 9008, 1556,
+ 6908, 4773, 3853, 8780, 10325, 7737, 1758, 7103,
+ 11375, 12273, 8602, 3243, 6536, 7590, 8591, 11552,
+ 6101, 3253, 9969, 9640, 4506, 3736, 6829, 10822,
+ 9130, 9948, 3566, 2133, 3901, 6038, 7333, 6609,
+ 3468, 4659, 625, 2700, 7738, 3443, 3060, 3388,
+ 3526, 4418, 11911, 6232, 1730, 2558, 10340, 5344,
+ 5286, 2190, 11562, 6199, 2482, 8756, 5387, 4101,
+ 4609, 8605, 8226, 144, 5656, 8704, 2621, 5424,
+ 10812, 2959, 11346, 6249, 1715, 4951, 9540, 1888,
+ 3764, 39, 8219, 2080, 2502, 1469, 10550, 8709,
+ 5601, 1093, 3784, 5041, 2058, 8399, 11448, 9639,
+ 2059, 9878, 7405, 2496, 7918, 11594, 371, 7993,
+ 3073, 10326, 40, 10004, 9245, 7987, 5603, 4051,
+ 7894, 676, 11380, 7379, 6501, 4981, 2628, 3488,
+ 10956, 7022, 6737, 9933, 7139, 2330, 3884, 5473,
+ 7865, 6941, 5737, 5613, 9505, 11568, 11277, 2510,
+ 6689, 386, 4462, 105, 2076, 10443, 119, 3955,
+ 4370, 11505, 3672, 11439, 750, 3240, 3133, 754,
+ 4013, 11929, 9210, 5378, 11881, 11018, 2818, 1851,
+ 4966, 8181, 2688, 6205, 6814, 926, 2936, 4327,
+ 10175, 7089, 6047, 9410, 10492, 8950, 2472, 6255,
+ 728, 7569, 6056, 10432, 11036, 2452, 2811, 3787,
+ 945, 8998, 1244, 8815, 11017, 11218, 5894, 4325,
+ 4639, 3819, 9826, 7056, 6786, 8670, 5539, 7707,
+ 1361, 9812, 2949, 11265, 10301, 9108, 478, 6489,
+ 101, 1911, 9483, 3608, 11997, 10536, 812, 8915,
+ 637, 8159, 5299, 9128, 3512, 8290, 7068, 7922,
+ 3036, 4759, 2163, 3937, 3755, 11306, 7739, 4922,
+ 11932, 424, 5538, 6228, 11131, 7778, 11974, 1097,
+ 2890, 10027, 2569, 2250, 2352, 821, 2550, 11016,
+ 7769, 136, 617, 3157, 5889, 9219, 6855, 120,
+ 4405, 1825, 9635, 7214, 10261, 11393, 2441, 9562,
+ 11176, 599, 2085, 11465, 7233, 6177, 4801, 9926,
+ 9010, 4514, 9455, 11352, 11670, 6174, 7950, 9766,
+ 6896, 11603, 3213, 8473, 9873, 2835, 10422, 3732,
+ 7961, 1457, 10857, 8069, 832, 1628, 3410, 4900,
+ 10855, 5111, 9543, 6325, 7431, 4083, 3072, 8847,
+ 9853, 10122, 5259, 11413, 6556, 303, 1465, 3871,
+ 4873, 5813, 10017, 6898, 3311, 5947, 8637, 5852,
+ 3856, 928, 4933, 8530, 1871, 2184, 5571, 5879,
+ 3481, 11597, 9511, 8153, 35, 2609, 5963, 8064,
+ 1080, 12039, 8444, 3052, 3813, 11065, 6736, 8454,
+ 2340, 7651, 1910, 10709, 2117, 9637, 6402, 6028,
+ 2124, 7701, 2679, 5183, 6270, 7424, 2597, 6795,
+ 9222, 10837, 280, 8583, 3270, 6753, 2354, 3779,
+ 6102, 4732, 5926, 2497, 8640, 10289, 6107, 12127,
+ 2958, 12287, 10292, 8086, 817, 4021, 2610, 1444,
+ 5899, 11720, 3292, 2424, 5090, 7242, 5205, 5281,
+ 9956, 2702, 6656, 735, 2243, 11656, 833, 3107,
+ 6012, 6801, 1126, 6339, 5250, 10391, 9642, 5278,
+ 3513, 9769, 3025, 779, 9433, 3392, 7437, 668,
+ 10184, 8111, 6527, 6568, 10831, 6482, 8263, 5711,
+ 9780, 467, 5462, 4425, 11999, 1205, 5015, 6918,
+ 5096, 3827, 5525, 11579, 3518, 4875, 7388, 1931,
+ 6615, 1541, 8708, 260, 3385, 4792, 4391, 5697,
+ 7895, 2155, 7337, 236, 10635, 11534, 1906, 4793,
+ 9527, 7239, 8354, 5121, 10662, 2311, 3346, 8556,
+ 707, 1088, 4936, 678, 10245, 18, 5684, 960,
+ 4459, 7957, 226, 2451, 6, 8874, 320, 6298,
+ 8963, 8735, 2852, 2981, 1707, 5408, 5017, 9876,
+ 9790, 2968, 1899, 6729, 4183, 5290, 10084, 7679,
+ 7941, 8744, 5694, 3461, 4175, 5747, 5561, 3378,
+ 5227, 952, 4319, 9810, 4356, 3088, 11118, 840,
+ 6257, 486, 6000, 1342, 10382, 6017, 4798, 5489,
+ 4498, 4193, 2306, 6521, 1475, 6372, 9029, 8037,
+ 1625, 7020, 4740, 5730, 7956, 6351, 6494, 6917,
+ 11405, 7487, 10202, 10155, 7666, 7556, 11509, 1546,
+ 6571, 10199, 2265, 7327, 5824, 11396, 11581, 9722,
+ 2251, 11199, 5356, 7408, 2861, 4003, 9215, 484,
+ 7526, 9409, 12235, 6157, 9025, 2121, 10255, 2519,
+ 9533, 3824, 8674, 11419, 10888, 4762, 11303, 4097,
+ 2414, 6496, 9953, 10554, 808, 2999, 2130, 4286,
+ 12078, 7445, 5132, 7915, 245, 5974, 4874, 7292,
+ 7560, 10539, 9952, 9075, 2113, 3721, 10285, 10022,
+ 9578, 8934, 11074, 9498, 294, 4711, 3391, 1377,
+ 9072, 10189, 4569, 10890, 9909, 6923, 53, 4653,
+ 439, 10253, 7028, 10207, 8343, 1141, 2556, 7601,
+ 8150, 10630, 8648, 9832, 7951, 11245, 2131, 5765,
+ 10343, 9781, 2718, 1419, 4531, 3844, 4066, 4293,
+ 11657, 11525, 11353, 4313, 4869, 12186, 1611, 10892,
+ 11489, 8833, 2393, 15, 10830, 5003, 17, 565,
+ 5891, 12177, 11058, 10412, 8885, 3974, 10981, 7130,
+ 5840, 10482, 8338, 6035, 6964, 1574, 10936, 2020,
+ 2465, 8191, 384, 2642, 2729, 5399, 2175, 9396,
+ 11987, 8035, 4375, 6611, 5010, 11812, 9131, 11427,
+ 104, 6348, 9643, 6757, 12110, 5617, 10935, 541,
+ 135, 3041, 7200, 6526, 5085, 12136, 842, 4129,
+ 7685, 11079, 8426, 1008, 2725, 11772, 6058, 1101,
+ 1950, 8424, 5688, 6876, 12005, 10079, 5335, 927,
+ 1770, 273, 8377, 2271, 5225, 10283, 116, 11807,
+ 91, 11699, 757, 1304, 7524, 6451, 8032, 8154,
+ 7456, 4191, 309, 2318, 2292, 10393, 11639, 9481,
+ 12238, 10594, 9569, 7912, 10368, 9889, 12244, 7179,
+ 3924, 3188, 367, 2077, 336, 5384, 5631, 8596,
+ 4621, 1775, 8866, 451, 6108, 1317, 6246, 8795,
+ 5896, 7283, 3132, 11564, 4977, 12161, 7371, 1366,
+ 12130, 10619, 3809, 5149, 6300, 2638, 4197, 1418,
+ 10065, 4156, 8373, 8644, 10445, 882, 8158, 10173,
+ 9763, 12191, 459, 2966, 3166, 405, 5000, 9311,
+ 6404, 8986, 1551, 8175, 3630, 10766, 9265, 700,
+ 8573, 9508, 6630, 11437, 11595, 5850, 3950, 4775,
+ 11941, 1446, 6018, 3386, 11470, 5310, 5476, 553,
+ 9474, 2586, 1431, 2741, 473, 11383, 4745, 836,
+ 4062, 10666, 7727, 11752, 5534, 312, 4307, 4351,
+ 5764, 8679, 8381, 8187, 5, 7395, 4363, 1152,
+ 5421, 5231, 6473, 436, 7567, 8603, 6229, 8230
+};
+
+/*
+ * Reduce a small signed integer modulo q. The source integer MUST
+ * be between -q/2 and +q/2.
+ */
+static inline uint32_t
+mq_conv_small(int x) {
+ /*
+ * If x < 0, the cast to uint32_t will set the high bit to 1.
+ */
+ uint32_t y;
+
+ y = (uint32_t)x;
+ y += Q & -(y >> 31);
+ return y;
+}
+
+/*
+ * Addition modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_add(uint32_t x, uint32_t y) {
+ /*
+ * We compute x + y - q. If the result is negative, then the
+ * high bit will be set, and 'd >> 31' will be equal to 1;
+ * thus '-(d >> 31)' will be an all-one pattern. Otherwise,
+ * it will be an all-zero pattern. In other words, this
+ * implements a conditional addition of q.
+ */
+ uint32_t d;
+
+ d = x + y - Q;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Subtraction modulo q. Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_sub(uint32_t x, uint32_t y) {
+ /*
+ * As in mq_add(), we use a conditional addition to ensure the
+ * result is in the 0..q-1 range.
+ */
+ uint32_t d;
+
+ d = x - y;
+ d += Q & -(d >> 31);
+ return d;
+}
+
+/*
+ * Division by 2 modulo q. Operand must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_rshift1(uint32_t x) {
+ x += Q & -(x & 1);
+ return (x >> 1);
+}
+
+/*
+ * Montgomery multiplication modulo q. If we set R = 2^16 mod q, then
+ * this function computes: x * y / R mod q
+ * Operands must be in the 0..q-1 range.
+ */
+static inline uint32_t
+mq_montymul(uint32_t x, uint32_t y) {
+ uint32_t z, w;
+
+ /*
+ * We compute x*y + k*q with a value of k chosen so that the 16
+ * low bits of the result are 0. We can then shift the value.
+ * After the shift, result may still be larger than q, but it
+ * will be lower than 2*q, so a conditional subtraction works.
+ */
+
+ z = x * y;
+ w = ((z * Q0I) & 0xFFFF) * Q;
+
+ /*
+ * When adding z and w, the result will have its low 16 bits
+ * equal to 0. Since x and y are lower than q, z = x*y is at most
+ * (q - 1)^2 and w is at most (2^16 - 1) * q, so their sum fits
+ * on 30 bits.
+ */
+ z = (z + w) >> 16;
+
+ /*
+ * After the shift, analysis shows that the value will be less
+ * than 2q. We subtract q, then add it back if the result went
+ * negative, so that the result lands in the 0..q-1 range.
+ */
+ z -= Q;
+ z += Q & -(z >> 31);
+ return z;
+}
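The constants Q, Q0I, R and R2 defined earlier can be cross-checked with plain 64-bit arithmetic; a throwaway sketch (check_montgomery_constants is a hypothetical helper):

#include <assert.h>

/* Illustration only: R = 2^16 mod q, R2 = 2^32 mod q, and q * Q0I must be
 * congruent to -1 modulo 2^16. */
static void
check_montgomery_constants(void) {
    assert(65536u % 12289u == 4091u);                 /* R   */
    assert((65536ull * 65536ull) % 12289u == 10952u); /* R2  */
    assert((12289u * 12287u) % 65536u == 65535u);     /* Q0I */
}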
+
+/*
+ * Montgomery squaring (computes (x^2)/R).
+ */
+static inline uint32_t
+mq_montysqr(uint32_t x) {
+ return mq_montymul(x, x);
+}
+
+/*
+ * Divide x by y modulo q = 12289.
+ */
+static inline uint32_t
+mq_div_12289(uint32_t x, uint32_t y) {
+ /*
+ * We invert y by computing y^(q-2) mod q.
+ *
+ * We use the following addition chain for exponent e = 12287:
+ *
+ * e0 = 1
+ * e1 = 2 * e0 = 2
+ * e2 = e1 + e0 = 3
+ * e3 = e2 + e1 = 5
+ * e4 = 2 * e3 = 10
+ * e5 = 2 * e4 = 20
+ * e6 = 2 * e5 = 40
+ * e7 = 2 * e6 = 80
+ * e8 = 2 * e7 = 160
+ * e9 = e8 + e2 = 163
+ * e10 = e9 + e8 = 323
+ * e11 = 2 * e10 = 646
+ * e12 = 2 * e11 = 1292
+ * e13 = e12 + e9 = 1455
+ * e14 = 2 * e13 = 2910
+ * e15 = 2 * e14 = 5820
+ * e16 = e15 + e10 = 6143
+ * e17 = 2 * e16 = 12286
+ * e18 = e17 + e0 = 12287
+ *
+ * Additions on exponents are converted to Montgomery
+ * multiplications. We define all intermediate results as so
+ * many local variables, and let the C compiler work out which
+ * must be kept around.
+ */
+ uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+ uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18;
+
+ y0 = mq_montymul(y, R2);
+ y1 = mq_montysqr(y0);
+ y2 = mq_montymul(y1, y0);
+ y3 = mq_montymul(y2, y1);
+ y4 = mq_montysqr(y3);
+ y5 = mq_montysqr(y4);
+ y6 = mq_montysqr(y5);
+ y7 = mq_montysqr(y6);
+ y8 = mq_montysqr(y7);
+ y9 = mq_montymul(y8, y2);
+ y10 = mq_montymul(y9, y8);
+ y11 = mq_montysqr(y10);
+ y12 = mq_montysqr(y11);
+ y13 = mq_montymul(y12, y9);
+ y14 = mq_montysqr(y13);
+ y15 = mq_montysqr(y14);
+ y16 = mq_montymul(y15, y10);
+ y17 = mq_montysqr(y16);
+ y18 = mq_montymul(y17, y0);
+
+ /*
+ * Final multiplication with x, which is not in Montgomery
+ * representation, computes the correct division result.
+ */
+ return mq_montymul(y18, x);
+}
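The chain above evaluates y^(q-2) with a fixed sequence of Montgomery squarings and multiplications, then folds in x. For illustration, a naive (non-constant-time) square-and-multiply computing the same quotient; naive_div_12289 is a hypothetical name:

/* Illustration only, not constant-time: x * y^(q-2) mod q by binary
 * exponentiation on q - 2 = 12287. */
static uint32_t
naive_div_12289(uint32_t x, uint32_t y) {
    uint32_t acc = 1;
    for (uint32_t e = 12289 - 2; e > 0; e >>= 1) {
        if (e & 1) {
            acc = (acc * y) % 12289;
        }
        y = (y * y) % 12289;
    }
    return (acc * x) % 12289;
}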
+
+/*
+ * Compute NTT on a ring element.
+ */
+static void
+mq_NTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+
+ n = (size_t)1 << logn;
+ t = n;
+ for (m = 1; m < n; m <<= 1) {
+ size_t ht, i, j1;
+
+ ht = t >> 1;
+ for (i = 0, j1 = 0; i < m; i ++, j1 += t) {
+ size_t j, j2;
+ uint32_t s;
+
+ s = GMb[m + i];
+ j2 = j1 + ht;
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v;
+
+ u = a[j];
+ v = mq_montymul(a[j + ht], s);
+ a[j] = (uint16_t)mq_add(u, v);
+ a[j + ht] = (uint16_t)mq_sub(u, v);
+ }
+ }
+ t = ht;
+ }
+}
+
+/*
+ * Compute the inverse NTT on a ring element, binary case.
+ */
+static void
+mq_iNTT(uint16_t *a, unsigned logn) {
+ size_t n, t, m;
+ uint32_t ni;
+
+ n = (size_t)1 << logn;
+ t = 1;
+ m = n;
+ while (m > 1) {
+ size_t hm, dt, i, j1;
+
+ hm = m >> 1;
+ dt = t << 1;
+ for (i = 0, j1 = 0; i < hm; i ++, j1 += dt) {
+ size_t j, j2;
+ uint32_t s;
+
+ j2 = j1 + t;
+ s = iGMb[hm + i];
+ for (j = j1; j < j2; j ++) {
+ uint32_t u, v, w;
+
+ u = a[j];
+ v = a[j + t];
+ a[j] = (uint16_t)mq_add(u, v);
+ w = mq_sub(u, v);
+ a[j + t] = (uint16_t)
+ mq_montymul(w, s);
+ }
+ }
+ t = dt;
+ m = hm;
+ }
+
+ /*
+ * To complete the inverse NTT, we must now divide all values by
+ * n (the vector size). We thus need the inverse of n, i.e. we
+ * need to divide 1 by 2, logn times. But we also want it in
+ * Montgomery representation, i.e. we also want to multiply it
+ * by R = 2^16. In the common case, this should be a simple right
+ * shift. The loop below is generic and works also in corner cases;
+ * its computation time is negligible.
+ */
+ ni = R;
+ for (m = n; m > 1; m >>= 1) {
+ ni = mq_rshift1(ni);
+ }
+ for (m = 0; m < n; m ++) {
+ a[m] = (uint16_t)mq_montymul(a[m], ni);
+ }
+}
+
+/*
+ * Convert a polynomial (mod q) to Montgomery representation.
+ */
+static void
+mq_poly_tomonty(uint16_t *f, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], R2);
+ }
+}
+
+/*
+ * Multiply two polynomials together (NTT representation, and using
+ * a Montgomery multiplication). Result f*g is written over f.
+ */
+static void
+mq_poly_montymul_ntt(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_montymul(f[u], g[u]);
+ }
+}
+
+/*
+ * Subtract polynomial g from polynomial f.
+ */
+static void
+mq_poly_sub(uint16_t *f, const uint16_t *g, unsigned logn) {
+ size_t u, n;
+
+ n = (size_t)1 << logn;
+ for (u = 0; u < n; u ++) {
+ f[u] = (uint16_t)mq_sub(f[u], g[u]);
+ }
+}
+
+/* ===================================================================== */
+
+/* see inner.h */
+void
+PQCLEAN_FALCONPADDED512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn) {
+ mq_NTT(h, logn);
+ mq_poly_tomonty(h, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
+ const uint16_t *h, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+
+ /*
+ * Reduce s2 elements modulo q ([0..q-1] range).
+ */
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
+ */
+ mq_NTT(tt, logn);
+ mq_poly_montymul_ntt(tt, h, logn);
+ mq_iNTT(tt, logn);
+ mq_poly_sub(tt, c0, logn);
+
+ /*
+ * Normalize -s1 elements into the [-q/2..q/2] range.
+ */
+ for (u = 0; u < n; u ++) {
+ int32_t w;
+
+ w = (int32_t)tt[u];
+ w -= (int32_t)(Q & -(((Q >> 1) - (uint32_t)w) >> 31));
+ ((int16_t *)tt)[u] = (int16_t)w;
+ }
+
+ /*
+ * Signature is valid if and only if the aggregate (-s1,s2) vector
+ * is short enough.
+ */
+ return PQCLEAN_FALCONPADDED512_CLEAN_is_short((int16_t *)tt, s2, logn);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_compute_public(uint16_t *h,
+ const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ tt[u] = (uint16_t)mq_conv_small(f[u]);
+ h[u] = (uint16_t)mq_conv_small(g[u]);
+ }
+ mq_NTT(h, logn);
+ mq_NTT(tt, logn);
+ for (u = 0; u < n; u ++) {
+ if (tt[u] == 0) {
+ return 0;
+ }
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_complete_private(int8_t *G,
+ const int8_t *f, const int8_t *g, const int8_t *F,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *t1, *t2;
+
+ n = (size_t)1 << logn;
+ t1 = (uint16_t *)tmp;
+ t2 = t1 + n;
+ for (u = 0; u < n; u ++) {
+ t1[u] = (uint16_t)mq_conv_small(g[u]);
+ t2[u] = (uint16_t)mq_conv_small(F[u]);
+ }
+ mq_NTT(t1, logn);
+ mq_NTT(t2, logn);
+ mq_poly_tomonty(t1, logn);
+ mq_poly_montymul_ntt(t1, t2, logn);
+ for (u = 0; u < n; u ++) {
+ t2[u] = (uint16_t)mq_conv_small(f[u]);
+ }
+ mq_NTT(t2, logn);
+ for (u = 0; u < n; u ++) {
+ if (t2[u] == 0) {
+ return 0;
+ }
+ t1[u] = (uint16_t)mq_div_12289(t1[u], t2[u]);
+ }
+ mq_iNTT(t1, logn);
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+ int32_t gi;
+
+ w = t1[u];
+ w -= (Q & ~ -((w - (Q >> 1)) >> 31));
+ gi = *(int32_t *)&w;
+ if (gi < -127 || gi > +127) {
+ return 0;
+ }
+ G[u] = (int8_t)gi;
+ }
+ return 1;
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_is_invertible(
+ const int16_t *s2, unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+ }
+ mq_NTT(tt, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ }
+ return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_verify_recover(uint16_t *h,
+ const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+ unsigned logn, uint8_t *tmp) {
+ size_t u, n;
+ uint16_t *tt;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+
+ /*
+ * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+ * and c0 - s1 into h[].
+ */
+ tt = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u];
+ w += Q & -(w >> 31);
+ tt[u] = (uint16_t)w;
+
+ w = (uint32_t)s1[u];
+ w += Q & -(w >> 31);
+ w = mq_sub(c0[u], w);
+ h[u] = (uint16_t)w;
+ }
+
+ /*
+ * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+ * is zero (in NTT representation) then the operation fails. We
+ * keep that information into a flag so that we do not deviate
+ * from strict constant-time processing; if all coefficients of
+ * s2 are non-zero, then the high bit of r will be zero.
+ */
+ mq_NTT(tt, logn);
+ mq_NTT(h, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ r |= (uint32_t)(tt[u] - 1);
+ h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+ }
+ mq_iNTT(h, logn);
+
+ /*
+ * Signature is acceptable if and only if it is short enough,
+ * and s2 was invertible mod phi mod q. The caller must still
+ * check that the rebuilt public key matches the expected
+ * value (e.g. through a hash).
+ */
+ r = ~r & (uint32_t) - PQCLEAN_FALCONPADDED512_CLEAN_is_short(s1, s2, logn);
+ return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCONPADDED512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+ uint16_t *s2;
+ size_t u, n;
+ uint32_t r;
+
+ n = (size_t)1 << logn;
+ s2 = (uint16_t *)tmp;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)sig[u];
+ w += Q & -(w >> 31);
+ s2[u] = (uint16_t)w;
+ }
+ mq_NTT(s2, logn);
+ r = 0;
+ for (u = 0; u < n; u ++) {
+ uint32_t w;
+
+ w = (uint32_t)s2[u] - 1u;
+ r += (w >> 31);
+ }
+ return (int)r;
+}
diff --git a/src/sig/falcon/sig_falcon.h b/src/sig/falcon/sig_falcon.h
index dfd43e88b..a8eb1454f 100644
--- a/src/sig/falcon/sig_falcon.h
+++ b/src/sig/falcon/sig_falcon.h
@@ -8,7 +8,7 @@
#if defined(OQS_ENABLE_SIG_falcon_512)
#define OQS_SIG_falcon_512_length_public_key 897
#define OQS_SIG_falcon_512_length_secret_key 1281
-#define OQS_SIG_falcon_512_length_signature 666
+#define OQS_SIG_falcon_512_length_signature 752
OQS_SIG *OQS_SIG_falcon_512_new(void);
OQS_API OQS_STATUS OQS_SIG_falcon_512_keypair(uint8_t *public_key, uint8_t *secret_key);
@@ -19,7 +19,7 @@ OQS_API OQS_STATUS OQS_SIG_falcon_512_verify(const uint8_t *message, size_t mess
#if defined(OQS_ENABLE_SIG_falcon_1024)
#define OQS_SIG_falcon_1024_length_public_key 1793
#define OQS_SIG_falcon_1024_length_secret_key 2305
-#define OQS_SIG_falcon_1024_length_signature 1280
+#define OQS_SIG_falcon_1024_length_signature 1462
OQS_SIG *OQS_SIG_falcon_1024_new(void);
OQS_API OQS_STATUS OQS_SIG_falcon_1024_keypair(uint8_t *public_key, uint8_t *secret_key);
@@ -27,4 +27,26 @@ OQS_API OQS_STATUS OQS_SIG_falcon_1024_sign(uint8_t *signature, size_t *signatur
OQS_API OQS_STATUS OQS_SIG_falcon_1024_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
#endif
+#if defined(OQS_ENABLE_SIG_falcon_padded_512)
+#define OQS_SIG_falcon_padded_512_length_public_key 897
+#define OQS_SIG_falcon_padded_512_length_secret_key 1281
+#define OQS_SIG_falcon_padded_512_length_signature 666
+
+OQS_SIG *OQS_SIG_falcon_padded_512_new(void);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024)
+#define OQS_SIG_falcon_padded_1024_length_public_key 1793
+#define OQS_SIG_falcon_padded_1024_length_secret_key 2305
+#define OQS_SIG_falcon_padded_1024_length_signature 1280
+
+OQS_SIG *OQS_SIG_falcon_padded_1024_new(void);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
#endif
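With these declarations in place, the new variants can also be driven through the generic OQS_SIG object API. A minimal usage sketch, assuming the OQS_SIG_alg_falcon_padded_512 identifier is registered elsewhere in this change set; unlike plain Falcon, the padded variant always emits signatures of exactly OQS_SIG_falcon_padded_512_length_signature bytes:

/* Illustration only, not part of the patch. */
#include <oqs/oqs.h>

static int
demo_falcon_padded_512(void) {
    OQS_SIG *sig = OQS_SIG_new(OQS_SIG_alg_falcon_padded_512);
    if (sig == NULL) {
        return -1; /* algorithm disabled in this build */
    }
    uint8_t pk[OQS_SIG_falcon_padded_512_length_public_key];
    uint8_t sk[OQS_SIG_falcon_padded_512_length_secret_key];
    uint8_t sm[OQS_SIG_falcon_padded_512_length_signature];
    size_t smlen = 0;
    const uint8_t msg[] = "example message";
    OQS_STATUS rc = OQS_SIG_keypair(sig, pk, sk);
    if (rc == OQS_SUCCESS) {
        rc = OQS_SIG_sign(sig, sm, &smlen, msg, sizeof msg, sk);
    }
    if (rc == OQS_SUCCESS) {
        rc = OQS_SIG_verify(sig, msg, sizeof msg, sm, smlen, pk);
    }
    OQS_SIG_free(sig);
    return (rc == OQS_SUCCESS) ? 0 : -1;
}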
diff --git a/src/sig/falcon/sig_falcon_padded_1024.c b/src/sig/falcon/sig_falcon_padded_1024.c
new file mode 100644
index 000000000..53b8c3926
--- /dev/null
+++ b/src/sig/falcon/sig_falcon_padded_1024.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_falcon.h>
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024)
+
+OQS_SIG *OQS_SIG_falcon_padded_1024_new(void) {
+
+ OQS_SIG *sig = malloc(sizeof(OQS_SIG));
+ if (sig == NULL) {
+ return NULL;
+ }
+ sig->method_name = OQS_SIG_alg_falcon_padded_1024;
+ sig->alg_version = "20211101 with PQClean patches";
+
+ sig->claimed_nist_level = 5;
+ sig->euf_cma = true;
+
+ sig->length_public_key = OQS_SIG_falcon_padded_1024_length_public_key;
+ sig->length_secret_key = OQS_SIG_falcon_padded_1024_length_secret_key;
+ sig->length_signature = OQS_SIG_falcon_padded_1024_length_signature;
+
+ sig->keypair = OQS_SIG_falcon_padded_1024_keypair;
+ sig->sign = OQS_SIG_falcon_padded_1024_sign;
+ sig->verify = OQS_SIG_falcon_padded_1024_verify;
+
+ return sig;
+}
+
+extern int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+extern int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+extern int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_keypair(public_key, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_1024_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_1024_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AVX2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_1024_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_AARCH64_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED1024_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif
+}
+
+#endif
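The wrapper above follows liboqs' usual backend-selection pattern: the preprocessor picks the best implementation that was compiled in, and when OQS_DIST_BUILD is set (a single binary meant to run on many CPUs) an extra run-time check on the CPU extension decides between the optimized backend and the portable CLEAN code. A distilled sketch of that pattern with hypothetical names, since the real probe and backends live elsewhere in the library:

#include <stdbool.h>

/* Hypothetical stand-ins for OQS_CPU_has_extension() and the PQClean backends. */
extern bool cpu_has_avx2(void);
extern int backend_avx2(void);
extern int backend_clean(void);

int dispatch(void) {
#if defined(ENABLE_AVX2_BACKEND)      /* optimized code was compiled in */
#if defined(DIST_BUILD)               /* portable binary: probe the CPU first */
    if (cpu_has_avx2()) {
#endif
        return backend_avx2();
#if defined(DIST_BUILD)
    } else {
        return backend_clean();       /* CPU lacks AVX2: portable fallback */
    }
#endif
#else
    return backend_clean();           /* only the portable backend exists */
#endif
}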
diff --git a/src/sig/falcon/sig_falcon_padded_512.c b/src/sig/falcon/sig_falcon_padded_512.c
new file mode 100644
index 000000000..9521187b8
--- /dev/null
+++ b/src/sig/falcon/sig_falcon_padded_512.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_falcon.h>
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_512)
+
+OQS_SIG *OQS_SIG_falcon_padded_512_new(void) {
+
+ OQS_SIG *sig = malloc(sizeof(OQS_SIG));
+ if (sig == NULL) {
+ return NULL;
+ }
+ sig->method_name = OQS_SIG_alg_falcon_padded_512;
+ sig->alg_version = "20211101 with PQClean patches";
+
+ sig->claimed_nist_level = 1;
+ sig->euf_cma = true;
+
+ sig->length_public_key = OQS_SIG_falcon_padded_512_length_public_key;
+ sig->length_secret_key = OQS_SIG_falcon_padded_512_length_secret_key;
+ sig->length_signature = OQS_SIG_falcon_padded_512_length_signature;
+
+ sig->keypair = OQS_SIG_falcon_padded_512_keypair;
+ sig->sign = OQS_SIG_falcon_padded_512_sign;
+ sig->verify = OQS_SIG_falcon_padded_512_verify;
+
+ return sig;
+}
+
+extern int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+extern int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+extern int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(public_key, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_keypair(public_key, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif
+}
+
+OQS_API OQS_STATUS OQS_SIG_falcon_padded_512_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_falcon_padded_512_avx2)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AVX2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_SIG_falcon_padded_512_aarch64)
+#if defined(OQS_DIST_BUILD)
+ if (OQS_CPU_has_extension(OQS_CPU_EXT_ARM_NEON)) {
+#endif /* OQS_DIST_BUILD */
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_AARCH64_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+ } else {
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+ }
+#endif /* OQS_DIST_BUILD */
+#else
+ return (OQS_STATUS) PQCLEAN_FALCONPADDED512_CLEAN_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif
+}
+
+#endif
diff --git a/src/sig/sig.c b/src/sig/sig.c
index b953af756..ae4147838 100644
--- a/src/sig/sig.c
+++ b/src/sig/sig.c
@@ -26,6 +26,8 @@ OQS_API const char *OQS_SIG_alg_identifier(size_t i) {
OQS_SIG_alg_ml_dsa_87,
OQS_SIG_alg_falcon_512,
OQS_SIG_alg_falcon_1024,
+ OQS_SIG_alg_falcon_padded_512,
+ OQS_SIG_alg_falcon_padded_1024,
OQS_SIG_alg_sphincs_sha2_128f_simple,
OQS_SIG_alg_sphincs_sha2_128s_simple,
OQS_SIG_alg_sphincs_sha2_192f_simple,
@@ -133,6 +135,20 @@ OQS_API int OQS_SIG_alg_is_enabled(const char *method_name) {
return 0;
#endif
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_512)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_512
+ return 1;
+#else
+ return 0;
+#endif
+
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_1024)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_1024
+ return 1;
+#else
+ return 0;
+#endif
+
} else if (0 == strcasecmp(method_name, OQS_SIG_alg_sphincs_sha2_128f_simple)) {
#ifdef OQS_ENABLE_SIG_sphincs_sha2_128f_simple
return 1;
@@ -305,6 +321,20 @@ OQS_API OQS_SIG *OQS_SIG_new(const char *method_name) {
return NULL;
#endif
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_512)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_512
+ return OQS_SIG_falcon_padded_512_new();
+#else
+ return NULL;
+#endif
+
+ } else if (0 == strcasecmp(method_name, OQS_SIG_alg_falcon_padded_1024)) {
+#ifdef OQS_ENABLE_SIG_falcon_padded_1024
+ return OQS_SIG_falcon_padded_1024_new();
+#else
+ return NULL;
+#endif
+
} else if (0 == strcasecmp(method_name, OQS_SIG_alg_sphincs_sha2_128f_simple)) {
#ifdef OQS_ENABLE_SIG_sphincs_sha2_128f_simple
return OQS_SIG_sphincs_sha2_128f_simple_new();
diff --git a/src/sig/sig.h b/src/sig/sig.h
index 97a40cd88..11db75f00 100644
--- a/src/sig/sig.h
+++ b/src/sig/sig.h
@@ -54,6 +54,10 @@ extern "C" {
#define OQS_SIG_alg_falcon_512 "Falcon-512"
/** Algorithm identifier for Falcon-1024 */
#define OQS_SIG_alg_falcon_1024 "Falcon-1024"
+/** Algorithm identifier for Falcon-padded-512 */
+#define OQS_SIG_alg_falcon_padded_512 "Falcon-padded-512"
+/** Algorithm identifier for Falcon-padded-1024 */
+#define OQS_SIG_alg_falcon_padded_1024 "Falcon-padded-1024"
/** Algorithm identifier for SPHINCS+-SHA2-128f-simple */
#define OQS_SIG_alg_sphincs_sha2_128f_simple "SPHINCS+-SHA2-128f-simple"
/** Algorithm identifier for SPHINCS+-SHA2-128s-simple */
@@ -83,7 +87,7 @@ extern "C" {
///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START
/** Number of algorithm identifiers above. */
-#define OQS_SIG_algs_length 23
+#define OQS_SIG_algs_length 25
///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END
/**
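With the count bumped to 25, the two new identifiers appear in the normal enumeration APIs, so callers that iterate over OQS_SIG_algs_length pick them up without further changes. A small sketch listing every known signature algorithm and whether it is enabled in the current build:

#include <stddef.h>
#include <stdio.h>

#include <oqs/oqs.h>

int main(void) {
    for (size_t i = 0; i < OQS_SIG_algs_length; i++) {
        const char *name = OQS_SIG_alg_identifier(i);
        printf("%-28s %s\n", name,
               OQS_SIG_alg_is_enabled(name) ? "enabled" : "disabled");
    }
    return 0;
}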
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
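This hunk and the similar ones below only adjust spacing around the multiplication, but for orientation: the loops implement MGF1, where the output is built from successive hash blocks of the seed concatenated with a 4-byte big-endian counter, truncated to the requested length. A minimal sketch, assuming a one-shot sha256(out, in, inlen) primitive with the same shape as the call above; the fixed-size seed buffer is a simplification for the sketch:

#include <stdint.h>
#include <string.h>

#define SHA256_OUT 32

/* Assumed one-shot hash, mirroring the sha256() used in the hunk above. */
void sha256(unsigned char *out, const unsigned char *in, size_t inlen);

static void u32_to_bytes_be(unsigned char *out, uint32_t v) {
    out[0] = (unsigned char)(v >> 24);
    out[1] = (unsigned char)(v >> 16);
    out[2] = (unsigned char)(v >> 8);
    out[3] = (unsigned char)v;
}

/* MGF1 over SHA-256: out = H(seed || 0) || H(seed || 1) || ..., truncated. */
static void mgf1_sha256(unsigned char *out, unsigned long outlen,
                        const unsigned char *seed, unsigned long seedlen) {
    unsigned char inbuf[64 + 4];          /* sketch assumes seedlen <= 64 */
    unsigned char lastblock[SHA256_OUT];
    unsigned long i;

    memcpy(inbuf, seed, seedlen);
    /* Full output blocks. */
    for (i = 0; (i + 1) * SHA256_OUT <= outlen; i++) {
        u32_to_bytes_be(inbuf + seedlen, (uint32_t)i);
        sha256(out, inbuf, seedlen + 4);
        out += SHA256_OUT;
    }
    /* Trailing partial block, if the requested length is not a multiple. */
    if (outlen > i * SHA256_OUT) {
        u32_to_bytes_be(inbuf + seedlen, (uint32_t)i);
        sha256(lastblock, inbuf, seedlen + 4);
        memcpy(out, lastblock, outlen - i * SHA256_OUT);
    }
}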
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128f-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c
index 329753380..a03540d3b 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-128s-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192f-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-192s-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256f-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c
index d97750c09..d2afbb0c4 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_avx2/sha256x8.c
@@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen,
memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
for (j = 0; j < 8; j++) {
u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c
index 5ba5e9cf3..828558f00 100644
--- a/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c
+++ b/src/sig/sphincs/pqclean_sphincs-sha2-256s-simple_clean/hash_sha2.c
@@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA256 output.. */
- for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha256(out, inbuf, inlen + 4);
out += SPX_SHA256_OUTPUT_BYTES;
@@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen,
memcpy(inbuf, in, inlen);
/* While we can fit in at least another full block of SHA512 output.. */
- for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
+ for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) {
u32_to_bytes(inbuf + inlen, i);
sha512(out, inbuf, inlen + 4);
out += SPX_SHA512_OUTPUT_BYTES;
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-128f-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-128s-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-192f-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-192s-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-256f-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c b/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c
index 89dc9a422..bbe043852 100644
--- a/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c
+++ b/src/sig/sphincs/pqclean_sphincs-shake-256s-simple_avx2/thash_shake_simplex4.c
@@ -58,9 +58,9 @@ void thashx4(unsigned char *out0,
}
state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56));
state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256(
- state[(SPX_N / 8) * (1 + inblocks) + 4],
- _mm256_set1_epi64x(0x1f)
- );
+ state[(SPX_N / 8) * (1 + inblocks) + 4],
+ _mm256_set1_epi64x(0x1f)
+ );
for (int i = 17; i < 25; i++) {
state[i] = _mm256_set1_epi64x(0);
}
diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json
index 73595b103..e60fe897b 100644
--- a/tests/KATs/sig/kats.json
+++ b/tests/KATs/sig/kats.json
@@ -19,6 +19,14 @@
"all": "f4f23c1153682007d5dec02c35e47061c17900fcf0adb3fd0437f1988fa13655",
"single": "da27fe8a462de7307ddf1f9b00072a457d9c5b14e838c148fbe2662094b9a2ca"
},
+ "Falcon-padded-1024": {
+ "all": "907a4931ddc2ce8360478a45f1bffededd6a04015b00233ecd851a62ecba06c1",
+ "single": "ddcc5683293388249e6fe85e992ea19d0986d34e060a44f82bc3db524a8c8390"
+ },
+ "Falcon-padded-512": {
+ "all": "362ecc0537ca1fe25143fb7ccb04de8ee7703469d13ebcf311ab124a5c374a65",
+ "single": "91842d41138e7cfaf6e2e8f12a03c3b3411302255121e4d07d02f91a003c0395"
+ },
"ML-DSA-44": {
"all": "183bc0c4398ade4fc17b6a7d876b82545a96331139a4f27269c95664b8c483f9",
"single": "e6f3ec4dc0b02dd3bcbbc6b105190e1890ca0bb3f802e2b571f0d70f3993a2e1"
diff --git a/tests/constant_time/sig/issues.json b/tests/constant_time/sig/issues.json
index 2cb9f200b..3b174fdca 100644
--- a/tests/constant_time/sig/issues.json
+++ b/tests/constant_time/sig/issues.json
@@ -5,6 +5,8 @@
"Dilithium5": [],
"Falcon-1024": ["falcon"],
"Falcon-512": ["falcon"],
+ "Falcon-padded-1024": ["falcon"],
+ "Falcon-padded-512": ["falcon"],
"ML-DSA-44-ipd": [],
"ML-DSA-65-ipd": [],
"ML-DSA-87-ipd": [],
diff --git a/tests/constant_time/sig/passes.json b/tests/constant_time/sig/passes.json
index fee99dcfc..a6096eb64 100644
--- a/tests/constant_time/sig/passes.json
+++ b/tests/constant_time/sig/passes.json
@@ -5,6 +5,8 @@
"Dilithium5": ["dilithium", "dilithium-avx2", "dilithium-aarch64"],
"Falcon-1024": ["falcon_keygen", "falcon_sign"],
"Falcon-512": ["falcon_keygen", "falcon_sign"],
+ "Falcon-padded-1024": ["falcon_keygen", "falcon_sign"],
+ "Falcon-padded-512": ["falcon_keygen", "falcon_sign"],
"ML-DSA-44-ipd": ["ml_dsa", "ml_dsa-avx2"],
"ML-DSA-65-ipd": ["ml_dsa", "ml_dsa-avx2"],
"ML-DSA-87-ipd": ["ml_dsa", "ml_dsa-avx2"],
diff --git a/tests/kat_sig.c b/tests/kat_sig.c
index db70d1dd3..21c208f3a 100644
--- a/tests/kat_sig.c
+++ b/tests/kat_sig.c
@@ -132,6 +132,26 @@ OQS_STATUS combine_message_signature(uint8_t **signed_msg, size_t *signed_msg_le
(*signed_msg)[42 + msg_len] = 0x2A;
memcpy(*signed_msg + 42 + msg_len + 1, falc_sig, signature_len - 41);
return OQS_SUCCESS;
+ } else if (0 == strcmp(sig->method_name, "Falcon-padded-512")) {
+ // signed_msg = signature || msg
+ *signed_msg_len = signature_len + msg_len;
+ *signed_msg = malloc(*signed_msg_len);
+ if (*signed_msg == NULL) {
+ return OQS_ERROR;
+ }
+ memcpy(*signed_msg, signature, signature_len);
+ memcpy(*signed_msg + signature_len, msg, msg_len);
+ return OQS_SUCCESS;
+ } else if (0 == strcmp(sig->method_name, "Falcon-padded-1024")) {
+ // signed_msg = signature || msg
+ *signed_msg_len = signature_len + msg_len;
+ *signed_msg = malloc(*signed_msg_len);
+ if (*signed_msg == NULL) {
+ return OQS_ERROR;
+ }
+ memcpy(*signed_msg, signature, signature_len);
+ memcpy(*signed_msg + signature_len, msg, msg_len);
+ return OQS_SUCCESS;
} else if (0 == strcmp(sig->method_name, "SPHINCS+-SHA2-128f-simple")) {
// signed_msg = signature || msg
*signed_msg_len = signature_len + msg_len;