From 58ef07d4ed8b4af1d1a781a1a31a03cf5391602e Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Sun, 13 Mar 2022 07:35:33 +0000
Subject: [PATCH 1/7] keccak256 2-to-1 hash function implemented

---
 include/keccak_256.hpp | 119 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 include/keccak_256.hpp

diff --git a/include/keccak_256.hpp b/include/keccak_256.hpp
new file mode 100644
index 0000000..bd3f0fa
--- /dev/null
+++ b/include/keccak_256.hpp
@@ -0,0 +1,119 @@
+#pragma once
+#include "sha3.hpp"
+
+namespace keccak_256 {
+
+// Keccak-256 specific input/ output width constants
+constexpr size_t IN_LEN_BITS = 512;               // = 2 x OUT_LEN_BITS
+constexpr size_t IN_LEN_BYTES = IN_LEN_BITS >> 3; // = 64
+
+constexpr size_t OUT_LEN_BITS = IN_LEN_BITS >> 1;   // = 256
+constexpr size_t OUT_LEN_BYTES = IN_LEN_BYTES >> 1; // = 32
+
+// Prepares 5 x 5 x 64 keccak state array ( twenty five 64 -bit unsigned
+// integers, written to `state` ) from 64 -bytes input message `in`; each of
+// first 8 lanes is assembled from 8 input bytes, least significant byte first
+//
+// Combines techniques adapted from section 3.1.2 of
+// http://dx.doi.org/10.6028/NIST.FIPS.202 and algorithm 10 defined in section
+// B.1 of same document
+//
+// Note, keccak-256 padding rule differs from SHA3 one; to be specific, compare
+// https://github.com/itzmeanjan/merklize-sha/blob/e421134ea4c9b1a832458bc870c3e79dc2849ecc/include/sha3_256.hpp#L43
+// ( sha3-256 implementation ) with write to 9-th state lane ( at index 8 of
+// state array ) in following function body
+//
+// I suggest you read https://keccak.team/files/Keccak-implementation-3.2.pdf 's
+// section 1.1 where padding rule is defined under `Keccak[r, c](M)` definition
+inline void
+to_state_array(const sycl::uchar* __restrict in,
+               sycl::ulong* const __restrict state)
+{
+#pragma unroll 8
+  for (size_t i = 0; i < 8; i++) {
+    state[i] = static_cast<sycl::ulong>(in[(i << 3) + 7]) << 56 |
+               static_cast<sycl::ulong>(in[(i << 3) + 6]) << 48 |
+               static_cast<sycl::ulong>(in[(i << 3) + 5]) << 40 |
+               static_cast<sycl::ulong>(in[(i << 3) + 4]) << 32 |
+               static_cast<sycl::ulong>(in[(i << 3) + 3]) << 24 |
+               static_cast<sycl::ulong>(in[(i << 3) + 2]) << 16 |
+               static_cast<sycl::ulong>(in[(i << 3) + 1]) << 8 |
+               static_cast<sycl::ulong>(in[(i << 3) + 0]) << 0;
+  }
+
+  // see how 0b01 is padded to input message; following keccak-256
+  // implementation guide
+  // https://keccak.team/files/Keccak-implementation-3.2.pdf 's section 1.1
+  // where `Keccak[r, c](M)` is defined ( specifically padding rule block in
+  // pseudocode, at very end of mentioned section )
+  //
+  // ! read right to left !
+  state[8] = 0b1ull;
+
+#pragma unroll 7
+  for (size_t i = 9; i < 16; i++) {
+    state[i] = 0ull;
+  }
+
+  // this 1 is added to input message bits due to padding requirement
+  // defined in keccak-256 implementation guide
+  // https://keccak.team/files/Keccak-implementation-3.2.pdf 's section 1.1
+  // where `Keccak[r, c](M)` is defined ( specifically padding rule block in
+  // pseudocode, at very end of mentioned section )
+  //
+  // ! read right to left, so it's actually 1 << 63 !
+  state[16] = 9223372036854775808ull;
+
+#pragma unroll 8
+  for (size_t i = 17; i < 25; i++) {
+    state[i] = 0ull;
+  }
+}
+
+// Produces 32 -bytes `digest` from first four lanes of hash state array `in`
+// ( dimension 5 x 5 x 64 ), each lane serialized least significant byte
+// first; see section 3.1.3 & algorithm 11 ( in section B.1 ) of
+// http://dx.doi.org/10.6028/NIST.FIPS.202
+//
+// Note, digest preparation method is same for both sha3-256 and keccak-256
+inline void
+to_digest_bytes(const sycl::ulong* __restrict in,
+                sycl::uchar* const __restrict digest)
+{
+#pragma unroll 4
+  for (size_t i = 0; i < 4; i++) {
+    const sycl::ulong lane = in[i];
+
+    digest[(i << 3) + 0] = static_cast<sycl::uchar>((lane >> 0) & 0xffull);
+    digest[(i << 3) + 1] = static_cast<sycl::uchar>((lane >> 8) & 0xffull);
+    digest[(i << 3) + 2] = static_cast<sycl::uchar>((lane >> 16) & 0xffull);
+    digest[(i << 3) + 3] = static_cast<sycl::uchar>((lane >> 24) & 0xffull);
+    digest[(i << 3) + 4] = static_cast<sycl::uchar>((lane >> 32) & 0xffull);
+    digest[(i << 3) + 5] = static_cast<sycl::uchar>((lane >> 40) & 0xffull);
+    digest[(i << 3) + 6] = static_cast<sycl::uchar>((lane >> 48) & 0xffull);
+    digest[(i << 3) + 7] = static_cast<sycl::uchar>((lane >> 56) & 0xffull);
+  }
+}
+
+// Keccak-256 2-to-1 hasher, where input is 64 contiguous bytes which is hashed
+// to produce 32 -bytes output
+//
+// This function itself doesn't do much other than calling other functions
+// which actually
+// - prepare state array from input byte array ( difference lies here, when
+// comparing between sha3-256 & keccak-256 )
+// - permute state using `keccak-p[b, n_r]`
+// - extract first 256 -bits of state array as digest
+//
+// See section 6.1 of http://dx.doi.org/10.6028/NIST.FIPS.202
+inline void
+hash(const sycl::uchar* __restrict in, sycl::uchar* const __restrict digest)
+{
+  sycl::ulong state[25];
+
+  to_state_array(in, state);
+  keccak_p(state);
+  to_digest_bytes(state, digest);
+}
+
+}

From 9ec4208a2fc03b340dbd59c5059884e4ee9b03ff Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Sun, 13 Mar 2022 07:35:59 +0000
Subject: [PATCH 2/7] accompanying test case for keccak-256 2-to-1 hash
 implementation

---
 include/test_keccak_256.hpp | 44 +++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 include/test_keccak_256.hpp

diff --git a/include/test_keccak_256.hpp b/include/test_keccak_256.hpp
new file mode 100644
index 0000000..cf3b781
--- /dev/null
+++ b/include/test_keccak_256.hpp
@@ -0,0 +1,44 @@
+#pragma once
+#include "keccak_256.hpp"
+#include <cassert>
+
+void
+test_keccak_256(sycl::queue& q)
+{
+  // checks keccak-256 digest of input bytes 0, 1, ..., 63 against expected
+  // value, obtained by executing following snippet in python3 shell
+  //
+  // $ python3 -m pip install --user pysha3
+  // $ python3
+  // >>> import sha3
+  // >>> list(sha3.keccak_256(bytes([i for i in range(64)])).digest())
+  // ( same input is prepared inside 👇 for loop )
+  constexpr sycl::uchar expected[32] = {
+    0,   32,  48,  189, 227, 212, 207, 137, 145, 150, 73,
+    119, 92,  215, 24,  117, 196, 208, 171, 23,  8,   163,
+    128, 224, 63,  239, 195, 162, 138, 162, 72,  49
+  };
+
+  // acquire resources
+  sycl::uchar* in = static_cast<sycl::uchar*>(sycl::malloc_shared(64, q));
+  sycl::uchar* out = static_cast<sycl::uchar*>(sycl::malloc_shared(32, q));
+
+#pragma unroll 16
+  for (size_t i = 0; i < 64; i++) {
+    // preparing input for testing 2-to-1 Keccak-256 hash
+    in[i] = i;
+  }
+
+  // enqueue kernel execution in single work-item model
+  q.single_task<class kernelTestSHA3_256>([=]() { keccak_256::hash(in, out); });
+  q.wait();
+
+  // check result !
+  for (size_t i = 0; i < 32; i++) {
+    assert(out[i] == expected[i]);
+  }
+
+  // ensure resources are deallocated
+  sycl::free(in, q);
+  sycl::free(out, q);
+}

From ddb7ac9d31787d68d04a504d31f6371117690918 Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Sun, 13 Mar 2022 09:30:24 +0000
Subject: [PATCH 3/7] extended binary merklization implementation to work with
 keccak256 as 2-to-1 hash function of choice

---
 include/merklize.hpp        |  33 +++++++++--
 include/test_keccak_256.hpp |   3 +-
 include/test_merklize.hpp   | 114 ++++++++++++++++++++++++------------
 test/main.cpp               |  10 ++++
 4 files changed, 116 insertions(+), 44 deletions(-)

diff --git a/include/merklize.hpp b/include/merklize.hpp
index 90e29c2..b78fcf9 100644
--- a/include/merklize.hpp
+++ b/include/merklize.hpp
@@ -3,7 +3,7 @@
 #if !(defined SHA1 || defined SHA2_224 || defined SHA2_256 ||                  \
       defined SHA2_384 || defined SHA2_512 || defined SHA2_512_224 ||          \
       defined SHA2_512_256 || defined SHA3_256 || defined SHA3_224 ||          \
-      defined SHA3_384 || defined SHA3_512)
+      defined SHA3_384 || defined SHA3_512 || defined KECCAK_256)
 #define SHA2_256
 #endif
 
@@ -40,6 +40,9 @@
 #elif defined SHA3_512
 #include "sha3_512.hpp"
 #pragma message "Choosing to compile Merklization with SHA3-512 !"
+#elif defined KECCAK_256
+#include "keccak_256.hpp"
+#pragma message "Choosing to compile Merklization with KECCAK-256 !"
 #endif
 
 // Binary merklization --- collects motivation from
@@ -56,7 +59,7 @@ merklize(sycl::queue& q,
   defined SHA2_512_256
          const sycl::ulong* __restrict leaf_nodes,
 #elif defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||              \
-  defined SHA3_512
+  defined SHA3_512 || defined KECCAK_256
          const sycl::uchar* __restrict leaf_nodes,
 #endif
 
@@ -69,7 +72,7 @@ merklize(sycl::queue& q,
   defined SHA2_512_256
          sycl::ulong* const __restrict intermediates,
 #elif defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||              \
-  defined SHA3_512
+  defined SHA3_512 || defined KECCAK_256
          sycl::uchar* const __restrict intermediates,
 #endif
 
@@ -116,12 +119,16 @@ merklize(sycl::queue& q,
 #elif defined SHA3_512
   assert(i_size == leaf_cnt * sha3_512::OUT_LEN_BYTES);
   assert(o_size == (itmd_cnt + 1) * sha3_512::OUT_LEN_BYTES);
+#elif defined KECCAK_256
+  assert(i_size == leaf_cnt * keccak_256::OUT_LEN_BYTES);
+  assert(o_size == (itmd_cnt + 1) * keccak_256::OUT_LEN_BYTES);
 #endif
 
   // both input and output allocation has same size
 #if defined SHA1 || defined SHA2_224 || defined SHA2_256 ||                    \
   defined SHA2_384 || defined SHA2_512 || defined SHA2_512_256 ||              \
-  defined SHA3_256 || defined SHA3_224 || defined SHA3_384 || defined SHA3_512
+  defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||                  \
+  defined SHA3_512 || defined KECCAK_256
 
   assert(i_size == o_size);
 
@@ -160,7 +167,7 @@ merklize(sycl::queue& q,
   // note that `o_size` is in terms of bytes
   const size_t elm_cnt = o_size >> 3;
 #elif defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||              \
-  defined SHA3_512
+  defined SHA3_512 || defined KECCAK_256
   // # -of 8 -bit unsigned integers ( read a byte ), which can be contiguously
   // placed on output memory allocation
   //
@@ -227,6 +234,9 @@ merklize(sycl::queue& q,
 #elif defined SHA3_512
         const size_t in_idx = idx * sha3_512::IN_LEN_BYTES;
         const size_t out_idx = idx * sha3_512::OUT_LEN_BYTES;
+#elif defined KECCAK_256
+        const size_t in_idx = idx * keccak_256::IN_LEN_BYTES;
+        const size_t out_idx = idx * keccak_256::OUT_LEN_BYTES;
 #endif
 
 #if defined SHA1
@@ -270,6 +280,11 @@ merklize(sycl::queue& q,
         sycl::uchar* out = intermediates + o_offset + out_idx;
 
         sha3_512::hash(in, out);
+#elif defined KECCAK_256
+        const sycl::uchar* in = leaf_nodes + i_offset + in_idx;
+        sycl::uchar* out = intermediates + o_offset + out_idx;
+
+        keccak_256::hash(in, out);
 #endif
       });
   });
@@ -398,6 +413,9 @@ merklize(sycl::queue& q,
 #elif defined SHA3_512
           const size_t in_idx = idx * sha3_512::IN_LEN_BYTES;
           const size_t out_idx = idx * sha3_512::OUT_LEN_BYTES;
+#elif defined KECCAK_256
+          const size_t in_idx = idx * keccak_256::IN_LEN_BYTES;
+          const size_t out_idx = idx * keccak_256::OUT_LEN_BYTES;
 #endif
 
 #if defined SHA1
@@ -446,6 +464,11 @@ merklize(sycl::queue& q,
           sycl::uchar* out = intermediates + o_offset_ + out_idx;
 
           sha3_512::hash(in, out);
+#elif defined KECCAK_256
+          const sycl::uchar* in = intermediates + i_offset_ + in_idx;
+          sycl::uchar* out = intermediates + o_offset_ + out_idx;
+
+          keccak_256::hash(in, out);
 #endif
         });
     });
diff --git a/include/test_keccak_256.hpp b/include/test_keccak_256.hpp
index cf3b781..e653235 100644
--- a/include/test_keccak_256.hpp
+++ b/include/test_keccak_256.hpp
@@ -30,7 +30,8 @@ test_keccak_256(sycl::queue& q)
   }
 
   // enqueue kernel execution in single work-item model
-  q.single_task<class kernelTestSHA3_256>([=]() { keccak_256::hash(in, out); });
+  q.single_task<class kernelTestKeccak_256>(
+    [=]() { keccak_256::hash(in, out); });
   q.wait();
 
   // check result !
diff --git a/include/test_merklize.hpp b/include/test_merklize.hpp
index 3432948..dd7e6bc 100644
--- a/include/test_merklize.hpp
+++ b/include/test_merklize.hpp
@@ -44,6 +44,9 @@ test_merklize(sycl::queue& q)
 #elif defined SHA3_512
   constexpr size_t i_size = leaf_cnt * sha3_512::OUT_LEN_BYTES; // in bytes
   constexpr size_t o_size = leaf_cnt * sha3_512::OUT_LEN_BYTES; // in bytes
+#elif defined KECCAK_256
+  constexpr size_t i_size = leaf_cnt * keccak_256::OUT_LEN_BYTES; // in bytes
+  constexpr size_t o_size = leaf_cnt * keccak_256::OUT_LEN_BYTES; // in bytes
 #endif
 
   // obtained using following code snippet run on python3 shell
@@ -54,11 +57,11 @@ test_merklize(sycl::queue& q)
   // >>> a = [0xff] * 40 # two leaf nodes
   // >>> b = list(hashlib.sha1(bytes(a)).digest()) # = [244, 67, 49, 150, 149,
   // 151, 153, 23, 160, 59, 113, 112, 73, 35, 84, 35, 135, 77, 39, 22]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha1(bytes(c)).digest()) # = [7, 7, 30, 157, 84, 109,
   // 232, 147, 213, 85, 108, 21, 251, 107, 125, 35, 100, 216, 165, 28]
-
+  //
   // >>> e = d * 2
   // >>> f = list(hashlib.sha1(bytes(e)).digest()) # = [139, 49, 56, 44, 55, 31,
   // 24, 110, 245, 27, 105, 167, 84, 13, 218, 12, 209, 49, 184, 54]
@@ -71,15 +74,15 @@ test_merklize(sycl::queue& q)
   // >>> b = list(hashlib.sha224(bytes(a)).digest()); b
   // [140, 250, 128, 28, 254, 116, 112, 113, 88, 113, 102, 5, 189, 54, 5, 27,
   // 74, 136, 109, 48, 20, 8, 50, 168, 140, 123, 210, 114]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha224(bytes(c)).digest()); d
   // [71, 212, 232, 90, 92, 160, 135, 245, 176, 115, 198, 156, 203, 178, 147,
   // 104, 12, 141, 40, 52, 153, 47, 215, 175, 88, 78, 74, 219]
-
+  //
   // >>> e = d * 2
   // >>> f = list(hashlib.sha224(bytes(e)).digest())
-
+  //
   // >>> f
   // [68, 112, 247, 219, 202, 225, 184, 209, 196, 9, 206, 28, 243, 98, 103, 193,
   // 123, 100, 218, 42, 254, 195, 132, 224, 199, 116, 140, 223]
@@ -94,15 +97,15 @@ test_merklize(sycl::queue& q)
   // >>> b = list(hashlib.sha256(bytes(a)).digest()); b
   // [134, 103, 231, 24, 41, 78, 158, 13, 241, 211, 6, 0, 186, 62, 235, 32, 31,
   // 118, 74, 173, 45, 173, 114, 116, 134, 67, 228, 162, 133, 225, 209, 247]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha256(bytes(c)).digest()); d
   // [55, 93, 108, 123, 40, 10, 30, 48, 249, 104, 219, 29, 148, 141, 160, 249,
   // 119, 191, 145, 57, 176, 213, 81, 103, 97, 172, 135, 71, 0, 32, 138, 186]
-
+  //
   // >>> e = d * 2
   // >>> f = list(hashlib.sha256(bytes(e)).digest())
-
+  //
   // >>> f
   // [190, 27, 112, 21, 237, 80, 215, 73, 10, 81, 241, 177, 29, 255, 128, 74,
   // 68, 64, 119, 92, 200, 8, 185, 207, 210, 97, 87, 128, 92, 31, 142, 134]
@@ -117,16 +120,16 @@ test_merklize(sycl::queue& q)
   // [120, 195, 4, 101, 32, 184, 165, 150, 9, 221, 16, 126, 43, 186, 64, 107,
   // 143, 124, 119, 179, 53, 135, 31, 39, 146, 115, 75, 158, 151, 254, 247, 182,
   // 91, 31, 17, 212, 123, 219, 246, 75, 217, 24, 111, 77, 215, 195, 125, 165]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha384(bytes(c)).digest()); d
   // [227, 29, 252, 255, 250, 146, 71, 38, 152, 231, 169, 100, 72, 182, 172, 85,
   // 39, 82, 76, 213, 182, 23, 141, 45, 195, 141, 134, 156, 50, 73, 29, 223,
   // 251, 156, 145, 97, 16, 6, 12, 104, 80, 1, 254, 85, 175, 233, 154, 150]
-
+  //
   // >>> e = d * 2
   // >>> f = list(hashlib.sha384(bytes(e)).digest())
-
+  //
   // >>> f
   // [239, 157, 55, 183, 110, 217, 152, 174, 198, 161, 104, 34, 255, 210, 42,
   // 127, 109, 225, 231, 137, 155, 208, 1, 12, 92, 229, 164, 16, 115, 202, 32,
@@ -146,17 +149,17 @@ test_merklize(sycl::queue& q)
   // 4, 207, 86, 153, 208, 31, 254, 102, 160, 26, 213, 84, 207, 176, 219, 137,
   // 107, 188, 22, 224, 139, 212, 251, 202, 179, 99, 100, 144, 158, 223, 80,
   // 236, 182, 200, 4, 39, 34, 164, 197, 148, 86, 217, 4, 130, 68, 205, 87, 240]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha512(bytes(c)).digest()); d
   // [29, 250, 176, 241, 104, 30, 96, 125, 162, 189, 87, 132, 239, 233, 197, 38,
   // 115, 203, 5, 77, 121, 19, 221, 76, 158, 205, 2, 246, 119, 235, 142, 168,
   // 208, 86, 184, 121, 23, 124, 5, 35, 213, 226, 12, 28, 89, 184, 202, 238, 78,
   // 226, 3, 191, 191, 67, 130, 141, 106, 49, 60, 195, 37, 126, 191, 246]
-
+  //
   // >>> e = d * 2
   // >>> f = list(hashlib.sha512(bytes(e)).digest())
-
+  //
   // >>> f
   // [16, 89, 255, 34, 217, 58, 55, 214, 124, 223, 84, 72, 189, 98, 82, 87, 164,
   // 252, 176, 254, 76, 1, 212, 167, 85, 125, 123, 2, 88, 197, 250, 70, 142, 62,
@@ -175,15 +178,15 @@ test_merklize(sycl::queue& q)
   // >>> b = list(SHA512.new(data= bytes([0xff] * 56),truncate='224').digest());
   // b [48, 203, 99, 172, 231, 234, 247, 242, 145, 165, 10, 53, 219, 85, 130,
   // 55, 155, 52, 43, 55, 172, 78, 125, 185, 119, 230, 148, 129]
-
+  //
   // >>> c = b * 2
   // >>> d = list(SHA512.new(data= bytes(c),truncate='224').digest()); d
   // [35, 211, 149, 84, 66, 218, 192, 196, 121, 52, 94, 75, 251, 40, 83, 102,
   // 182, 23, 45, 239, 44, 2, 97, 100, 31, 26, 4, 142]
-
+  //
   // >>> e = d * 2
   // >>> f =  list(SHA512.new(data= bytes(e),truncate='224').digest())
-
+  //
   // >>> f
   // [45, 213, 101, 185, 49, 91, 242, 198, 250, 179, 90, 147, 49, 158, 113, 189,
   // 131, 120, 67, 135, 193, 39, 110, 71, 26, 195, 63, 193]
@@ -197,15 +200,15 @@ test_merklize(sycl::queue& q)
   // >>> b = list(SHA512.new(data= bytes(a),truncate='256').digest()); b
   // [254, 218, 174, 155, 245, 143, 133, 143, 128, 130, 195, 85, 44, 169, 71,
   // 77, 8, 123, 94, 131, 30, 38, 179, 26, 164, 7, 159, 115, 132, 111, 54, 93]
-
+  //
   // >>> c = b * 2
   // >>> d = list(SHA512.new(data= bytes(c),truncate='256').digest()); d
   // [111, 152, 206, 204, 224, 191, 32, 143, 125, 172, 90, 72, 37, 40, 72, 147,
   // 253, 199, 207, 161, 98, 76, 13, 24, 105, 250, 17, 79, 29, 58, 7, 136]
-
+  //
   // >>> e = d * 2
   // >>> f =  list(SHA512.new(data= bytes(e),truncate='256').digest())
-
+  //
   // >>> f
   // [129, 151, 248, 46, 143, 39, 163, 78, 234, 177, 146, 147, 233, 80, 172,
   // 144, 1, 184, 229, 187, 174, 201, 189, 160, 169, 168, 64, 21, 112, 149, 72,
@@ -221,15 +224,15 @@ test_merklize(sycl::queue& q)
   // >>> b = list(hashlib.sha3_256(bytes(a)).digest()); b
   // [127, 216, 219, 145, 139, 238, 83, 121, 178, 47, 88, 60, 230, 71, 159, 120,
   // 77, 35, 40, 22, 190, 170, 86, 66, 36, 58, 115, 74, 129, 101, 161, 90]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha3_256(bytes(c)).digest()); d
   // [105, 66, 252, 242, 214, 146, 9, 148, 126, 206, 138, 110, 64, 115, 31, 49,
   // 54, 217, 247, 151, 154, 223, 58, 84, 111, 217, 196, 181, 72, 62, 22, 52]
-
+  //
   // >>> e = d * 2
   // >>> f =  list(hashlib.sha3_256(bytes(e)).digest())
-
+  //
   // >>> f
   // [159, 200, 74, 194, 101, 231, 247, 10, 65, 194, 250, 128, 32, 140, 171, 51,
   // 143, 128, 183, 61, 78, 102, 179, 87, 41, 4, 59, 151, 162, 190, 109, 76]
@@ -243,15 +246,15 @@ test_merklize(sycl::queue& q)
   // >>> b = list(hashlib.sha3_224(bytes(a)).digest()); b
   // [49, 228, 11, 40, 246, 167, 246, 82, 85, 97, 72, 228, 3, 119, 46, 39, 63,
   // 25, 58, 233, 130, 72, 222, 235, 18, 114, 166, 34]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha3_224(bytes(c)).digest()); d
   // [200, 148, 113, 17, 93, 252, 82, 235, 69, 198, 146, 204, 127, 203, 235,
   // 238, 55, 222, 219, 95, 25, 108, 225, 225, 192, 235, 241, 241]
-
+  //
   // >>> e = d * 2
   // >>> f =  list(hashlib.sha3_224(bytes(e)).digest())
-
+  //
   // >>> f
   // [255, 38, 15, 99, 54, 66, 125, 85, 251, 165, 20, 200, 220, 70, 206, 152,
   // 237, 28, 64, 8, 62, 226, 202, 222, 2, 25, 165, 60]
@@ -266,16 +269,16 @@ test_merklize(sycl::queue& q)
   // [171, 233, 159, 157, 95, 204, 31, 31, 236, 79, 72, 45, 206, 134, 237, 245,
   // 217, 103, 151, 124, 43, 36, 121, 15, 238, 100, 216, 167, 98, 24, 155, 47,
   // 2, 140, 237, 192, 14, 196, 134, 95, 201, 176, 235, 150, 211, 121, 69, 172]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha3_384(bytes(c)).digest()); d
   // [5, 197, 34, 253, 78, 138, 132, 51, 248, 1, 221, 153, 56, 43, 167, 187,
   // 116, 63, 213, 227, 228, 178, 57, 226, 110, 244, 49, 15, 171, 35, 123, 215,
   // 130, 253, 144, 161, 229, 124, 246, 255, 214, 243, 211, 54, 36, 50, 121, 34]
-
+  //
   // >>> e = d * 2
   // >>> f =  list(hashlib.sha3_384(bytes(e)).digest())
-
+  //
   // >>> f
   // [254, 147, 220, 144, 226, 81, 255, 216, 251, 31, 114, 222, 160, 4, 214,
   // 253, 241, 188, 170, 34, 234, 105, 40, 43, 185, 57, 62, 159, 178, 128, 231,
@@ -295,17 +298,17 @@ test_merklize(sycl::queue& q)
   // 146, 238, 82, 76, 148, 243, 19, 57, 100, 55, 163, 147, 116, 220, 180, 58,
   // 110, 62, 22, 14, 161, 121, 230, 51, 182, 43, 210, 196, 98, 152, 203, 89,
   // 79, 8, 59, 77, 22, 182, 226, 66, 52, 173, 74, 113, 254, 148, 12, 89, 143]
-
+  //
   // >>> c = b * 2
   // >>> d = list(hashlib.sha3_512(bytes(c)).digest()); d
   // [90, 38, 103, 232, 22, 8, 142, 185, 126, 112, 249, 248, 215, 110, 229, 137,
   // 98, 207, 23, 227, 59, 253, 237, 21, 219, 78, 2, 171, 18, 10, 225, 178, 175,
   // 234, 197, 55, 73, 194, 24, 65, 30, 62, 13, 45, 118, 210, 177, 7, 195, 79,
   // 87, 133, 141, 223, 151, 63, 237, 89, 2, 137, 221, 249, 22, 193]
-
+  //
   // >>> e = d * 2
   // >>> f =  list(hashlib.sha3_512(bytes(e)).digest())
-
+  //
   // >>> f
   // [104, 212, 199, 69, 96, 90, 255, 254, 172, 66, 99, 91, 90, 90, 62, 47, 134,
   // 86, 55, 203, 175, 8, 19, 95, 220, 54, 162, 251, 214, 102, 195, 100, 185,
@@ -318,6 +321,30 @@ test_merklize(sycl::queue& q)
     177, 100, 141, 206, 4,   39,  65,  1,   168, 4,  149, 112, 77,
     212, 175, 50,  150, 42,  29,  174, 20,  201, 12, 120, 26
   };
+#elif defined KECCAK_256
+  // $ python3 -m pip install --user pysha3
+  // $ python3
+  //
+  // >>> a = [0xff] * 64
+  // >>> b = list(sha3.keccak_256(bytes(a)).digest()); b
+  // [189, 139, 21, 23, 115, 219, 190, 253, 123, 13, 246, 127, 45, 204, 72, 41,
+  // 1, 114, 139, 109, 244, 119, 244, 251, 47, 25, 39, 51, 160, 5, 211, 150]
+  //
+  // >>> c = b * 2
+  // >>> d = list(sha3.keccak_256(bytes(c)).digest()); d
+  // [238, 17, 177, 202, 20, 1, 218, 228, 63, 30, 216, 224, 237, 4, 93, 208, 56,
+  // 203, 176, 73, 157, 86, 222, 106, 194, 202, 66, 147, 11, 147, 162, 74]
+  //
+  // >>> e = d * 2
+  // >>> f =  list(sha3.keccak_256(bytes(e)).digest())
+  //
+  // >>> f
+  // [236, 6, 179, 40, 94, 80, 24, 219, 209, 152, 28, 100, 219, 246, 233, 206,
+  // 160, 47, 165, 145, 240, 50, 43, 81, 207, 188, 49, 167, 41, 80, 9, 40]
+  constexpr sycl::uchar expected[32] = {
+    236, 6,  179, 40,  94,  80, 24, 219, 209, 152, 28, 100, 219, 246, 233, 206,
+    160, 47, 165, 145, 240, 50, 43, 81,  207, 188, 49, 167, 41,  80,  9,   40
+  };
 #endif
 
 #if defined SHA1 || defined SHA2_224 || defined SHA2_256
@@ -334,13 +361,14 @@ test_merklize(sycl::queue& q)
   sycl::ulong* out_0 = (sycl::ulong*)sycl::malloc_shared(o_size, q);
   sycl::uchar* out_1 = (sycl::uchar*)sycl::malloc_shared(o_size, q);
 #elif defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||              \
-  defined SHA3_512
+  defined SHA3_512 || defined KECCAK_256
   // acquire resources
   sycl::uchar* in = (sycl::uchar*)sycl::malloc_shared(i_size, q);
   sycl::uchar* out = (sycl::uchar*)sycl::malloc_shared(o_size, q);
 #endif
 
-#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 || defined SHA3_512
+#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||                \
+  defined SHA3_512 || defined KECCAK_256
 
   // prepare input bytes
   q.memset(in, 0xff, i_size).wait();
@@ -390,7 +418,8 @@ test_merklize(sycl::queue& q)
 
 #endif
 
-#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 || defined SHA3_512
+#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||                \
+  defined SHA3_512 || defined KECCAK_256
 
   // wait until completely merklized !
   merklize(q, in, i_size, leaf_cnt, out, o_size, leaf_cnt - 1, leaf_cnt >> 1);
@@ -449,12 +478,15 @@ test_merklize(sycl::queue& q)
                      sha3_384::OUT_LEN_BYTES
 #elif defined SHA3_512
                      sha3_512::OUT_LEN_BYTES
+#elif defined KECCAK_256
+                     keccak_256::OUT_LEN_BYTES
 #endif
 
        ;
 
        i++) {
-#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 || defined SHA3_512
+#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||                \
+  defined SHA3_512 || defined KECCAK_256
 
     assert(*(out + i) == 0);
 
@@ -513,10 +545,15 @@ test_merklize(sycl::queue& q)
   for (size_t i = sha3_512::OUT_LEN_BYTES, j = 0;
        i < (sha3_512::OUT_LEN_BYTES << 1) && j < sha3_512::OUT_LEN_BYTES;
        i++, j++)
+#elif defined KECCAK_256
+  for (size_t i = keccak_256::OUT_LEN_BYTES, j = 0;
+       i < (keccak_256::OUT_LEN_BYTES << 1) && j < keccak_256::OUT_LEN_BYTES;
+       i++, j++)
 #endif
 
   {
-#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 || defined SHA3_512
+#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||                \
+  defined SHA3_512 || defined KECCAK_256
 
     assert(*(out + i) == expected[j]);
 
@@ -528,7 +565,8 @@ test_merklize(sycl::queue& q)
   }
 
   // ensure resources are deallocated
-#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 || defined SHA3_512
+#if defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||                \
+  defined SHA3_512 || defined KECCAK_256
 
   sycl::free(in, q);
   sycl::free(out, q);
diff --git a/test/main.cpp b/test/main.cpp
index b40d22a..c366e87 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -23,6 +23,8 @@
 #include "test_sha3_384.hpp"
 #elif defined SHA3_512
 #include "test_sha3_512.hpp"
+#elif defined KECCAK_256
+#include "test_keccak_256.hpp"
 #endif
 
 int
@@ -93,6 +95,11 @@ main(int argc, char** argv)
   test_sha3_512(q);
   std::cout << "passed SHA3-512 test !" << std::endl;
 
+#elif defined KECCAK_256
+
+  test_keccak_256(q);
+  std::cout << "passed Keccak-256 test !" << std::endl;
+
 #endif
 
   test_merklize(q);
@@ -129,6 +136,9 @@ main(int argc, char** argv)
 #elif defined SHA3_512
   std::cout << "passed binary merklization ( using SHA3-512 ) test !"
             << std::endl;
+#elif defined KECCAK_256
+  std::cout << "passed binary merklization ( using KECCAK-256 ) test !"
+            << std::endl;
 #endif
 
   return EXIT_SUCCESS;

From f03707f42f20e9deb4cdbbdc0e10f3490d514ee1 Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Sun, 13 Mar 2022 09:35:04 +0000
Subject: [PATCH 4/7] benchmark keccak256 based binary merklization

---
 bench/main.cpp             | 4 ++++
 include/bench_merklize.hpp | 7 ++++++-
 run.sh                     | 1 +
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/bench/main.cpp b/bench/main.cpp
index 230ea9f..d3e449d 100644
--- a/bench/main.cpp
+++ b/bench/main.cpp
@@ -73,6 +73,10 @@ main(int argc, char** argv)
 #elif defined SHA3_512
   std::cout << "\nBenchmarking Binary Merklization using SHA3-512" << std::endl
             << std::endl;
+#elif defined KECCAK_256
+  std::cout << "\nBenchmarking Binary Merklization using KECCAK-256"
+            << std::endl
+            << std::endl;
 #endif
 
   std::cout << std::setw(16) << std::right << "leaf count"
diff --git a/include/bench_merklize.hpp b/include/bench_merklize.hpp
index 158d5b7..de94bef 100644
--- a/include/bench_merklize.hpp
+++ b/include/bench_merklize.hpp
@@ -54,6 +54,9 @@ benchmark_merklize(sycl::queue& q,
 #elif defined SHA3_512
   const size_t i_size = leaf_cnt * sha3_512::OUT_LEN_BYTES; // in bytes
   const size_t o_size = leaf_cnt * sha3_512::OUT_LEN_BYTES; // in bytes
+#elif defined KECCAK_256
+  const size_t i_size = leaf_cnt * keccak_256::OUT_LEN_BYTES; // in bytes
+  const size_t o_size = leaf_cnt * keccak_256::OUT_LEN_BYTES; // in bytes
 #endif
 
 #if defined SHA1 || defined SHA2_224 || defined SHA2_256
@@ -70,7 +73,7 @@ benchmark_merklize(sycl::queue& q,
   sycl::ulong* i_d = static_cast<sycl::ulong*>(sycl::malloc_device(i_size, q));
   sycl::ulong* o_d = static_cast<sycl::ulong*>(sycl::malloc_device(o_size, q));
 #elif defined SHA3_256 || defined SHA3_224 || defined SHA3_384 ||              \
-  defined SHA3_512
+  defined SHA3_512 || defined KECCAK_256
   // allocate resources
   sycl::uchar* i_h = static_cast<sycl::uchar*>(sycl::malloc_host(i_size, q));
   sycl::uchar* o_h = static_cast<sycl::uchar*>(sycl::malloc_host(o_size, q));
@@ -135,6 +138,8 @@ benchmark_merklize(sycl::queue& q,
                      (sha3_384::OUT_LEN_BYTES)
 #elif defined SHA3_512
                      (sha3_512::OUT_LEN_BYTES)
+#elif defined KECCAK_256
+                     (keccak_256::OUT_LEN_BYTES)
 #endif
 
          ;
diff --git a/run.sh b/run.sh
index 9e98a31..bd0b7f5 100755
--- a/run.sh
+++ b/run.sh
@@ -20,3 +20,4 @@ SHA=sha3_256     make; make clean
 SHA=sha3_224     make; make clean
 SHA=sha3_384     make; make clean
 SHA=sha3_512     make; make clean
+SHA=keccak_256     make; make clean

From 3a5fa2998a0f51761c651eee1ac908e96dc26a2e Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Mon, 14 Mar 2022 04:03:32 +0000
Subject: [PATCH 5/7] added benchmark results of keccak256 based binary
 merklization on multiple platforms ( cpu, gpu etc. )

---
 README.md                        |  10 ++-
 results/keccak-256/intel_cpu.md  | 109 +++++++++++++++++++++++++++++++
 results/keccak-256/intel_gpu.md  |  24 +++++++
 results/keccak-256/nvidia_gpu.md |  24 +++++++
 4 files changed, 165 insertions(+), 2 deletions(-)
 create mode 100644 results/keccak-256/intel_cpu.md
 create mode 100644 results/keccak-256/intel_gpu.md
 create mode 100644 results/keccak-256/nvidia_gpu.md

diff --git a/README.md b/README.md
index fb69a5c..7a968be 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ SYCL accelerated Binary Merklization using SHA1, SHA2 & SHA3
 
 ## Motivation
 
-After implementing BLAKE3 using SYCL, I decided to accelerate 2-to-1 hash implementation of all variants of SHA1, SHA2 & SHA3 families of cryptographic hash functions. BLAKE3 lends itself pretty well to parallelization efforts, due to its inherent data parallel friendly algorithmic construction, where each 1024 -bytes chunk can be compressed independently ( read parallelly ) and finally it's a binary merklization problem with compressed chunks as leaf nodes of binary merkle tree. But none of SHA1, SHA2 & SHA3 families of cryptographic hash functions are data parallel, requiring to process each message block ( can be 512 -bit/ 1024 -bit or padded to 1600 -bit in case of SHA3 family ) sequentially, which is why I only concentrated on accelerating Binary Merklization where SHA1/ SHA2/ SHA3 families of cryptographic ( 2-to-1 ) hash functions are used for computing all intermediate nodes of tree when N -many leaf nodes are provided, where `N = 2 ^ i | i = {1, 2, 3 ...}`. Each of these N -many leaf nodes are respective hash digests --- for example, when using SHA2-256 variant for computing all intermediate nodes of binary merkle tree, each of provided leaf node is 32 -bytes wide, representing a SHA2-256 digest. Now, N -many leaf digests are merged into N/ 2 -many digests which are intermediate nodes, living just above leaf nodes. Then in next phase, those N/ 2 -many intermediates are used for computing N/ 4 -many of intermediates which are living just above them. This process continues until root of merkle tree is computed. Notice, that in each level of tree, each consecutive pair of digests can be hashed independently --- and that's the scope of parallelism I'd like to make use of during binary merklization. In following depiction, when N ( = 4 ) nodes are provided as input, two intermediates can be computed in parallel and once they're computed root of tree can be computed as a single task.
+After implementing BLAKE3 using SYCL, I decided to accelerate 2-to-1 hash implementation of all variants of SHA1, SHA2 & SHA3 families of cryptographic hash functions ( along with `keccak256` ). BLAKE3 lends itself pretty well to parallelization efforts, due to its inherent data parallel friendly algorithmic construction, where each 1024 -bytes chunk can be compressed independently ( read parallelly ) and finally it's a binary merklization problem with compressed chunks as leaf nodes of binary merkle tree. But none of SHA1, SHA2 & SHA3 ( or keccak256 ) families of cryptographic hash functions are data parallel, requiring to process each message block ( can be 512 -bit/ 1024 -bit or padded to 1600 -bit in case of SHA3 family ) sequentially, which is why I only concentrated on accelerating Binary Merklization where SHA1/ SHA2/ SHA3 families of cryptographic ( 2-to-1 ) hash functions are used for computing all intermediate nodes of tree when N -many leaf nodes are provided, where `N = 2 ^ i | i = {1, 2, 3 ...}`. Each of these N -many leaf nodes are respective hash digests --- for example, when using SHA2-256 variant for computing all intermediate nodes of binary merkle tree, each of provided leaf node is 32 -bytes wide, representing a SHA2-256 digest. Now, N -many leaf digests are merged into N/ 2 -many digests which are intermediate nodes, living just above leaf nodes. Then in next phase, those N/ 2 -many intermediates are used for computing N/ 4 -many of intermediates which are living just above them. This process continues until root of merkle tree is computed. Notice, that in each level of tree, each consecutive pair of digests can be hashed independently --- and that's the scope of parallelism I'd like to make use of during binary merklization. In following depiction, when N ( = 4 ) nodes are provided as input, two intermediates can be computed in parallel and once they're computed root of tree can be computed as a single task.
 
 ```bash
   ((a, b), (c, d))          < --- [Level 1] [Root]
@@ -25,7 +25,7 @@ input   = [a, b, c, d]
 output  = [0, ((a, b), (c, d)), (a, b), (c, d)]
 ```
 
-Here in this repository, I'm keeping binary merklization kernels, implemented in SYCL, while using SHA1/ SHA2/ SHA3 variants as 2-to-1 hash function, which one to use is compile-time choice using pre-processor directive.
+Here in this repository, I'm keeping binary merklization kernels, implemented in SYCL, while using SHA1/ SHA2/ SHA3 variants as 2-to-1 hash function ( along with keccak256 ); which one to use is a compile-time choice, made using a pre-processor directive.
 
 If you happen to be interested in Binary Merklization using Rescue Prime Hash/ BLAKE3, consider seeing following links.
 
@@ -36,6 +36,8 @@ If you happen to be interested in Binary Merklization using Rescue Prime Hash/ B
 
 > During SHA3 implementations, I've followed SHA-3 Standard [specification](http://dx.doi.org/10.6028/NIST.FIPS.202).
 
+> During Keccak256 implementation, I took some inspiration from [here](https://keccak.team/files/Keccak-implementation-3.2.pdf); note that keccak256 & sha3-256 are very similar, differing only in the input message padding rule; see the PR description at https://github.com/itzmeanjan/merklize-sha/pull/10.
+
 > Using SHA1 for binary merklization may not be a good choice these days, see [here](https://csrc.nist.gov/Projects/Hash-Functions/NIST-Policy-on-Hash-Functions). But still I'm keeping SHA1 implementation, just as a reference.
 
 ## Prerequisites
@@ -153,5 +155,9 @@ I'm keeping binary merklization benchmark results of
   - [Nvidia GPU(s)](results/sha3-512/nvidia_gpu.md)
   - [Intel CPU(s)](results/sha3-512/intel_cpu.md)
   - [Intel GPU(s)](results/sha3-512/intel_gpu.md)
+- KECCAK-256
+  - [Nvidia GPU(s)](results/keccak-256/nvidia_gpu.md)
+  - [Intel CPU(s)](results/keccak-256/intel_cpu.md)
+  - [Intel GPU(s)](results/keccak-256/intel_gpu.md)
 
 obtained after executing them on multiple accelerators.
diff --git a/results/keccak-256/intel_cpu.md b/results/keccak-256/intel_cpu.md
new file mode 100644
index 0000000..3508569
--- /dev/null
+++ b/results/keccak-256/intel_cpu.md
@@ -0,0 +1,109 @@
+### Binary Merklization using KECCAK-256 on Intel CPU(s)
+
+Compiling with
+
+```bash
+SHA=keccak_256 make aot_cpu
+```
+
+### On `Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz`
+
+```bash
+$ lscpu | grep -i cpu\(s\)
+
+CPU(s):                          4
+On-line CPU(s) list:             0-3
+NUMA node0 CPU(s):               0-3
+```
+
+```bash
+running on Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
+
+
+Benchmarking Binary Merklization using KECCAK-256
+
+      leaf count                  execution time                host-to-device tx time          device-to-host tx time
+        2 ^ 20                   466.478477 ms                     3.288778 ms                     3.442020 ms
+        2 ^ 21                   898.963977 ms                     6.508914 ms                     6.558546 ms
+        2 ^ 22                      1.797621 s                    13.061319 ms                    13.172746 ms
+        2 ^ 23                      3.591501 s                    27.324937 ms                    27.123078 ms
+        2 ^ 24                      7.186666 s                    54.148528 ms                    54.237210 ms
+        2 ^ 25                     14.404052 s                   123.865217 ms                   108.246855 ms
+```
+
+### On `Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz`
+
+```bash
+$ lscpu | grep -i cpu\(s\)
+
+CPU(s):                          128
+On-line CPU(s) list:             0-127
+NUMA node0 CPU(s):               0-31,64-95
+NUMA node1 CPU(s):               32-63,96-127
+```
+
+```bash
+running on Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
+
+
+Benchmarking Binary Merklization using KECCAK-256
+
+      leaf count                  execution time                host-to-device tx time          device-to-host tx time
+        2 ^ 20                    13.362355 ms                     1.821476 ms                     1.326708 ms
+        2 ^ 21                    20.922397 ms                     3.589614 ms                     2.430955 ms
+        2 ^ 22                    33.674320 ms                     6.493885 ms                     4.294246 ms
+        2 ^ 23                   106.859444 ms                    11.947260 ms                     8.593155 ms
+        2 ^ 24                   117.165222 ms                    23.851139 ms                     8.417020 ms
+        2 ^ 25                   233.647003 ms                    25.051263 ms                    16.673447 ms
+```
+
+### On `Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz`
+
+```bash
+$ lscpu | grep -i cpu\(s\)
+
+CPU(s):                          24
+On-line CPU(s) list:             0-23
+NUMA node0 CPU(s):               0-5,12-17
+NUMA node1 CPU(s):               6-11,18-23
+```
+
+```bash
+running on Intel(R) Xeon(R) Gold 6128 CPU @ 3.40GHz
+
+
+Benchmarking Binary Merklization using KECCAK-256
+
+      leaf count                  execution time                host-to-device tx time          device-to-host tx time
+        2 ^ 20                    34.571529 ms                     1.809763 ms                   897.616875 us
+        2 ^ 21                    61.404680 ms                     3.326612 ms                     1.588368 ms
+        2 ^ 22                   117.968746 ms                     5.674248 ms                     7.157974 ms
+        2 ^ 23                   231.852088 ms                     9.238144 ms                    13.273680 ms
+        2 ^ 24                   462.241001 ms                    20.315251 ms                    12.602417 ms
+        2 ^ 25                   924.972606 ms                    31.446401 ms                    24.707977 ms
+```
+
+### On `Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz`
+
+```bash
+$ lscpu | grep -i cpu\(s\)
+
+CPU(s):                          12
+On-line CPU(s) list:             0-11
+NUMA node0 CPU(s):               0-11
+```
+
+```bash
+running on Intel(R) Xeon(R) E-2176G CPU @ 3.70GHz
+
+
+Benchmarking Binary Merklization using KECCAK-256
+
+      leaf count                  execution time                host-to-device tx time          device-to-host tx time
+        2 ^ 20                    73.894415 ms                   932.138625 us                   850.445250 us
+        2 ^ 21                   109.423621 ms                     1.782943 ms                     1.715456 ms
+        2 ^ 22                   218.244072 ms                     3.493360 ms                     3.446031 ms
+        2 ^ 23                   436.918616 ms                     6.905427 ms                     6.842661 ms
+        2 ^ 24                   883.594877 ms                    13.812258 ms                    13.749230 ms
+        2 ^ 25                      1.930962 s                    27.554382 ms                    27.591307 ms
+```
diff --git a/results/keccak-256/intel_gpu.md b/results/keccak-256/intel_gpu.md
new file mode 100644
index 0000000..231c9c5
--- /dev/null
+++ b/results/keccak-256/intel_gpu.md
@@ -0,0 +1,24 @@
+### Binary Merklization using KECCAK-256 on Intel GPU(s)
+
+Compiling with
+
+```bash
+SHA=keccak_256 make aot_gpu
+```
+
+### On `Intel(R) UHD Graphics P630 [0x3e96]`
+
+```bash
+running on Intel(R) UHD Graphics P630 [0x3e96]
+
+
+Benchmarking Binary Merklization using KECCAK-256
+
+      leaf count                  execution time                host-to-device tx time          device-to-host tx time
+        2 ^ 20                   108.488926 ms                     1.332275 ms                   745.381500 us
+        2 ^ 21                   212.384799 ms                     1.497735 ms                     1.454533 ms
+        2 ^ 22                   422.459127 ms                     5.289694 ms                     2.832562 ms
+        2 ^ 23                   841.035348 ms                     5.684048 ms                     5.597084 ms
+        2 ^ 24                      1.679276 s                    11.176738 ms                    11.080438 ms
+        2 ^ 25                      3.355854 s                    22.150604 ms                    22.356589 ms
+```
diff --git a/results/keccak-256/nvidia_gpu.md b/results/keccak-256/nvidia_gpu.md
new file mode 100644
index 0000000..13b1145
--- /dev/null
+++ b/results/keccak-256/nvidia_gpu.md
@@ -0,0 +1,24 @@
+### Binary Merklization using KECCAK-256 on Nvidia GPU(s)
+
+Compile with
+
+```bash
+SHA=keccak_256 make cuda
+```
+
+### On `Tesla V100-SXM2-16GB`
+
+```bash
+running on Tesla V100-SXM2-16GB
+
+
+Benchmarking Binary Merklization using KECCAK-256
+
+      leaf count                  execution time                host-to-device tx time          device-to-host tx time
+        2 ^ 20                   751.924875 us                     1.167792 ms                     1.005363 ms
+        2 ^ 21                     1.344910 ms                     2.304931 ms                     2.016678 ms
+        2 ^ 22                     2.517974 ms                     4.593017 ms                     4.025208 ms
+        2 ^ 23                     4.864380 ms                     9.128906 ms                     8.053345 ms
+        2 ^ 24                     8.179686 ms                    18.250488 ms                    16.049194 ms
+        2 ^ 25                    16.144776 ms                    36.534668 ms                    32.099121 ms
+```

From 75dfd47f7fce3483f650affc85852a04c3156d21 Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Mon, 14 Mar 2022 04:42:08 +0000
Subject: [PATCH 6/7] usage example of keccak256 2-to-1 hash

---
 example/keccak_256.cpp | 68 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 example/keccak_256.cpp

diff --git a/example/keccak_256.cpp b/example/keccak_256.cpp
new file mode 100644
index 0000000..bffed52
--- /dev/null
+++ b/example/keccak_256.cpp
@@ -0,0 +1,68 @@
+#include "keccak_256.hpp"
+#include <cassert>
+
+// Usage example of 2-to-1 KECCAK-256 hash: merges two 32 -byte digests into one
+int
+main(int argc, char** argv)
+{
+  // $ python3
+  // >>> a = [0xff] * 32
+  //
+  // first input digest ( 32 bytes, each 0xff )
+  constexpr sycl::uchar digest_0[32] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+  };
+
+  // >>> b = [0x0f] * 32
+  //
+  // second input digest ( 32 bytes, each 0x0f )
+  constexpr sycl::uchar digest_1[32] = { 15, 15, 15, 15, 15, 15, 15, 15,
+                                         15, 15, 15, 15, 15, 15, 15, 15,
+                                         15, 15, 15, 15, 15, 15, 15, 15,
+                                         15, 15, 15, 15, 15, 15, 15, 15 };
+
+  // >>> c = a + b
+  // >>> import sha3
+  // >>> list(sha3.keccak_256(bytes(c)).digest())
+  //
+  // expected output digest, as produced by above python snippet for `a + b`
+  constexpr sycl::uchar digest_2[32] = { 134, 168, 210, 52, 189, 102, 98,  76,
+                                         116, 7,   56,  37, 176, 235, 150, 128,
+                                         138, 149, 200, 26, 32,  23,  79,  38,
+                                         139, 137, 109, 85, 195, 183, 230, 9 };
+
+  sycl::default_selector s{};
+  sycl::device d{ s };
+  sycl::context c{ d };
+  sycl::queue q{ c, d };
+
+  // shared USM allocation, sized to hold both input digests back to back
+  sycl::uchar* in = static_cast<sycl::uchar*>(
+    sycl::malloc_shared(sizeof(digest_0) + sizeof(digest_1), q));
+
+  // shared USM allocation, sized to hold one output digest
+  sycl::uchar* out =
+    static_cast<sycl::uchar*>(sycl::malloc_shared(sizeof(digest_2), q));
+
+  // place digest_0 first & digest_1 right after it, waiting on each copy
+  q.memcpy(in + 0, digest_0, sizeof(digest_0)).wait();
+  q.memcpy(in + sizeof(digest_0), digest_1, sizeof(digest_1)).wait();
+
+  // compute 2-to-1 hash as a single task, then wait for it to finish
+  q.single_task<class kernelExampleKECCAK_256>(
+    [=]() { keccak_256::hash(in, out); });
+  q.wait();
+
+  // check output byte-by-byte ( note: assert is a no-op if NDEBUG is defined )
+  for (size_t i = 0; i < sizeof(digest_2); i++) {
+    assert(*(out + i) == digest_2[i]);
+  }
+
+  // release both shared allocations
+  sycl::free(in, q);
+  sycl::free(out, q);
+
+  return EXIT_SUCCESS;
+}

From 35574c78275c4b5d5f07cd07989a042aa029408d Mon Sep 17 00:00:00 2001
From: Anjan Roy <anjanroy@yandex.com>
Date: Mon, 14 Mar 2022 04:43:15 +0000
Subject: [PATCH 7/7] final touch in documentation

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7a968be..b5880ed 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # merklize-sha
 
-SYCL accelerated Binary Merklization using SHA1, SHA2 & SHA3
+SYCL accelerated Binary Merklization using SHA1, SHA2 & SHA3 ( along with keccak256 )
 
 ## Motivation
 
-After implementing BLAKE3 using SYCL, I decided to accelerate 2-to-1 hash implementation of all variants of SHA1, SHA2 & SHA3 families of cryptographic hash functions ( along with `keccak256` ). BLAKE3 lends itself pretty well to parallelization efforts, due to its inherent data parallel friendly algorithmic construction, where each 1024 -bytes chunk can be compressed independently ( read parallelly ) and finally it's a binary merklization problem with compressed chunks as leaf nodes of binary merkle tree. But none of SHA1, SHA2 & SHA3 ( or keccak256 ) families of cryptographic hash functions are data parallel, requiring to process each message block ( can be 512 -bit/ 1024 -bit or padded to 1600 -bit in case of SHA3 family ) sequentially, which is why I only concentrated on accelerating Binary Merklization where SHA1/ SHA2/ SHA3 families of cryptographic ( 2-to-1 ) hash functions are used for computing all intermediate nodes of tree when N -many leaf nodes are provided, where `N = 2 ^ i | i = {1, 2, 3 ...}`. Each of these N -many leaf nodes are respective hash digests --- for example, when using SHA2-256 variant for computing all intermediate nodes of binary merkle tree, each of provided leaf node is 32 -bytes wide, representing a SHA2-256 digest. Now, N -many leaf digests are merged into N/ 2 -many digests which are intermediate nodes, living just above leaf nodes. Then in next phase, those N/ 2 -many intermediates are used for computing N/ 4 -many of intermediates which are living just above them. This process continues until root of merkle tree is computed. Notice, that in each level of tree, each consecutive pair of digests can be hashed independently --- and that's the scope of parallelism I'd like to make use of during binary merklization. In following depiction, when N ( = 4 ) nodes are provided as input, two intermediates can be computed in parallel and once they're computed root of tree can be computed as a single task.
+After implementing BLAKE3 using SYCL, I decided to accelerate 2-to-1 hash implementation of all variants of SHA1, SHA2 & SHA3 families of cryptographic hash functions ( along with keccak256 ). BLAKE3 lends itself pretty well to parallelization efforts, due to its inherent data parallel friendly algorithmic construction, where each 1024 -bytes chunk can be compressed independently ( read in parallel ) and finally it's a binary merklization problem with compressed chunks as leaf nodes of binary merkle tree. But none of SHA1, SHA2 & SHA3 ( or keccak256 ) families of cryptographic hash functions are data parallel, requiring each message block ( can be 512 -bit/ 1024 -bit or padded to 1600 -bit in case of SHA3 family ) to be processed sequentially, which is why I only concentrated on accelerating Binary Merklization where SHA1/ SHA2/ SHA3 families of cryptographic ( 2-to-1 ) hash functions are used for computing all intermediate nodes of tree when N -many leaf nodes are provided, where `N = 2 ^ i | i = {1, 2, 3 ...}`. Each of these N -many leaf nodes is a hash digest --- for example, when using SHA2-256 variant for computing all intermediate nodes of binary merkle tree, each provided leaf node is 32 -bytes wide, representing a SHA2-256 digest. Now, N -many leaf digests are merged into N/ 2 -many digests which are intermediate nodes, living just above leaf nodes. Then in next phase, those N/ 2 -many intermediates are used for computing N/ 4 -many intermediates which are living just above them. This process continues until root of merkle tree is computed. Notice that in each level of tree, each consecutive pair of digests can be hashed independently --- and that's the scope of parallelism I'd like to make use of during binary merklization. In the following depiction, when N ( = 4 ) nodes are provided as input, two intermediates can be computed in parallel and once they're computed root of tree can be computed as a single task.
 
 ```bash
   ((a, b), (c, d))          < --- [Level 1] [Root]
@@ -90,12 +90,13 @@ If you happen to be interested in 2-to-1 hash implementation of
 - [SHA3-256](https://github.com/itzmeanjan/merklize-sha/blob/8f9b168/example/sha3_256.cpp)
 - [SHA3-384](https://github.com/itzmeanjan/merklize-sha/blob/8f9b168/example/sha3_384.cpp)
 - [SHA3-512](https://github.com/itzmeanjan/merklize-sha/blob/8f9b168/example/sha3_512.cpp)
+- [KECCAK-256](https://github.com/itzmeanjan/merklize-sha/blob/75dfd47/example/keccak_256.cpp)
 
 where two digests of respective hash functions are input, in byte concatenated form, to `hash( ... )` function, consider taking a look at above hyperlinked examples.
 
 > Compile above examples using `dpcpp -fsycl example/<file>.cpp -I./include`
 
-You will probably like to see how binary merklization kernels use these 2-to-1 hash functions; see [here](https://github.com/itzmeanjan/merklize-sha/blob/4aadd99/include/merklize.hpp)
+You will probably want to see how binary merklization kernels use these 2-to-1 hash functions; see [here](https://github.com/itzmeanjan/merklize-sha/blob/ddb7ac9/include/merklize.hpp)
 
 ## Tests