From be2c9c5dfefc7f4ad7591c58f7857b177bef5b0e Mon Sep 17 00:00:00 2001 From: WenLei Date: Thu, 28 Aug 2025 14:46:28 +0800 Subject: [PATCH] Add hardware-accelerated CRC32 support for riscv64 using the v,zbc,zvbc extension --- .../hadoop-common/src/CMakeLists.txt | 3 + .../src/org/apache/hadoop/util/bulk_crc32.c | 4 +- .../src/org/apache/hadoop/util/bulk_crc32.h | 2 + .../apache/hadoop/util/bulk_crc32_riscv64.c | 611 ++++++++++++++++++ .../org/apache/hadoop/util/test_bulk_crc32.c | 2 + 5 files changed, 620 insertions(+), 2 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv64.c diff --git a/hadoop-common-project/hadoop-common/src/CMakeLists.txt b/hadoop-common-project/hadoop-common/src/CMakeLists.txt index d2ef03645a4ae..e979eeedce081 100644 --- a/hadoop-common-project/hadoop-common/src/CMakeLists.txt +++ b/hadoop-common-project/hadoop-common/src/CMakeLists.txt @@ -157,6 +157,9 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "x set(BULK_CRC_ARCH_SOURCE_FIlE "${SRC}/util/bulk_crc32_x86.c") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") set(BULK_CRC_ARCH_SOURCE_FIlE "${SRC}/util/bulk_crc32_aarch64.c") +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + set(BULK_CRC_ARCH_SOURCE_FIlE "${SRC}/util/bulk_crc32_riscv64.c") + add_compile_options("-march=rv64gcv_zbc_zvbc") else() message("No HW CRC acceleration for ${CMAKE_SYSTEM_PROCESSOR}, falling back to SW") endif() diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c index b3bb69959b233..c9eec6d32e15e 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c @@ -156,7 +156,7 @@ static uint32_t crc_val(uint32_t crc) { * Computes the CRC32c checksum for the specified buffer using the slicing by 8 * algorithm over 64 bit quantities. 
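Note that the new -march=rv64gcv_zbc_zvbc option only tells the compiler which instructions it may emit; whether the accelerated path is actually taken is still decided at runtime by the constructor at the end of bulk_crc32_riscv64.c. As a purely illustrative sketch (not part of the patch), the dependency on the flag could be made explicit with the standard RISC-V extension test macros, assuming a toolchain recent enough to define __riscv_zbc and __riscv_zvbc:

    /* Illustrative guard, not in the patch: fail the build loudly if
     * bulk_crc32_riscv64.c is ever compiled without the Zbc/Zvbc
     * extensions enabled, e.g. if the CMake -march flag above is dropped. */
    #if !defined(__riscv_zbc) || !defined(__riscv_zvbc)
    #error "bulk_crc32_riscv64.c requires -march=rv64gcv_zbc_zvbc"
    #endif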
*/ -static uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length) { +uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length) { uint32_t running_length = ((length)/8)*8; uint32_t end_bytes = length - running_length; int li; @@ -201,7 +201,7 @@ static void pipelined_crc32c_sb8(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, * Update a CRC using the "zlib" polynomial -- what Hadoop calls CHECKSUM_CRC32 * using slicing-by-8 */ -static uint32_t crc32_zlib_sb8( +uint32_t crc32_zlib_sb8( uint32_t crc, const uint8_t *buf, size_t length) { uint32_t running_length = ((length)/8)*8; uint32_t end_bytes = length - running_length; diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.h b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.h index b38a65acc6b6a..0a61f987bc1c7 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.h +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.h @@ -40,6 +40,8 @@ typedef struct crc32_error { const uint8_t *bad_data; // pointer to start of data chunk with error } crc32_error_t; +extern uint32_t crc32c_sb8(uint32_t crc, const uint8_t *buf, size_t length); +extern uint32_t crc32_zlib_sb8(uint32_t crc, const uint8_t *buf, size_t length); /** * Either calculates checksums for or verifies a buffer of data. diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv64.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv64.c new file mode 100644 index 0000000000000..8f7e2a1ae7da6 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32_riscv64.c @@ -0,0 +1,611 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
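Everything in the new bulk_crc32_riscv64.c is built on carry-less (XOR) multiplication: vclmul.vv/vclmulh.vv from Zvbc fold 128-bit lanes of input into the running remainder, and the scalar clmul/clmulh from Zbc finish the job with a Barrett reduction. For readers without the ISA manual at hand, a minimal software model of that primitive (illustrative only; the patch uses the real instructions) is:

    #include <stdint.h>
    #include <stdio.h>

    /* Software model of the Zbc/Zvbc primitive: clmul yields the low 64 bits
     * of the carry-less (XOR) product, clmulh the high 64 bits. Sketch only. */
    static void clmul_soft(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) {
        uint64_t l = 0, h = 0;
        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                l ^= a << i;                    /* product bits 0..63   */
                h ^= i ? a >> (64 - i) : 0;     /* product bits 64..127 */
            }
        }
        *lo = l;
        *hi = h;
    }

    int main(void) {
        uint64_t lo, hi;
        /* Multiply two of the CRC32C constants defined below, as a demo. */
        clmul_soft(0x493c7d27ULL, 0xdd45aab8ULL, &lo, &hi);
        printf("clmul  = 0x%016llx\nclmulh = 0x%016llx\n",
               (unsigned long long)lo, (unsigned long long)hi);
        return 0;
    }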
+ */ + +#include +#include +#include + +#include "bulk_crc32.h" +#include "gcc_optimizations.h" + +// CRC32C (iSCSI) polynomial constants +#define CRC32C_CONST_0 0xdd45aab8U +#define CRC32C_CONST_1 0x493c7d27U +#define CRC32C_CONST_QUO 0x0dea713f1ULL +#define CRC32C_CONST_POLY 0x105ec76f1ULL + +// CRC32 (GZIP/zlib) polynomial constants +#define CRC32_CONST_0 0xb8bc6765U +#define CRC32_CONST_1 0xccaa009eU +#define CRC32_CONST_QUO 0x1f7011641ULL +#define CRC32_CONST_POLY 0x1db710641ULL + +// Folding constants for CRC32C +static const uint64_t crc32c_fold_const[4] __attribute__((aligned(16))) = { + 0x00000000740eef02ULL, 0x000000009e4addf8ULL, + 0x00000000f20c0dfeULL, 0x00000000493c7d27ULL +}; + +// Folding constants for CRC32 (GZIP) +static const uint64_t crc32_fold_const[4] __attribute__((aligned(16))) = { + 0x000000008f352d95ULL, 0x000000001d9513d7ULL, + 0x00000000ae689191ULL, 0x00000000ccaa009eULL +}; + +/** + * Hardware-accelerated CRC32C using RISC-V vector crypto extensions. + * This uses the reflected polynomial version compatible with standard CRC32C. + */ +static uint32_t crc32c_vclmul(const uint8_t *buf, size_t len, uint32_t crc) { + if (unlikely(len < 64)) { + // Fall back to table-based implementation for small buffers + return crc32c_sb8(crc, buf, len); + } + + uint32_t result; + const uint64_t *fold_consts = crc32c_fold_const; + + __asm__ __volatile__( + // Initialize CRC + "li t5, 0xffffffff\n\t" + "and %[crc], %[crc], t5\n\t" + "li a3, 0\n\t" + "li t1, 64\n\t" + + // Set vector configuration for 128-bit elements + "vsetivli zero, 2, e64, m1, ta, ma\n\t" + + // Load first 64 bytes and initialize + "mv a4, %[buf]\n\t" + "vle64.v v0, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "vle64.v v1, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "vle64.v v2, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "vle64.v v3, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "andi a3, %[len], ~63\n\t" + "addi t0, a3, -64\n\t" + + // XOR initial CRC into first vector + "vmv.s.x v4, zero\n\t" + "vmv.s.x v5, %[crc]\n\t" + "vslideup.vi v5, v4, 1\n\t" + "vxor.vv v0, v0, v5\n\t" + "vmv.s.x v8, zero\n\t" + + // Load folding constant + "add a5, a4, t0\n\t" + "mv t4, %[consts]\n\t" + "vle64.v v5, 0(t4)\n\t" + + // Check if we need main loop + "addi t0, %[len], -64\n\t" + "bltu t0, t1, 2f\n\t" + + // Main loop - process 64 bytes at a time + "1:\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v0, v5\n\t" + "vclmulh.vv v0, v0, v5\n\t" + "vredxor.vs v0, v0, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v0, 1\n\t" + "vxor.vv v0, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v1, v5\n\t" + "vclmulh.vv v1, v1, v5\n\t" + "vredxor.vs v1, v1, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v1, 1\n\t" + "vxor.vv v1, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v2, v5\n\t" + "vclmulh.vv v2, v2, v5\n\t" + "vredxor.vs v2, v2, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v2, 1\n\t" + "vxor.vv v2, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v3, v5\n\t" + "vclmulh.vv v3, v3, v5\n\t" + "vredxor.vs v3, v3, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v3, 1\n\t" + "vxor.vv v3, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "bne a4, a5, 1b\n\t" + + // Fold 512 bits to 128 bits + "2:\n\t" + "addi t4, t4, 16\n\t" + "vle64.v v5, 0(t4)\n\t" + "vclmul.vv v6, v0, v5\n\t" + "vclmulh.vv v7, v0, v5\n\t" + "vredxor.vs v6, v6, v8\n\t" + "vredxor.vs v7, v7, v8\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vxor.vv v0, v6, v1\n\t" + + "vclmul.vv 
v6, v0, v5\n\t" + "vclmulh.vv v7, v0, v5\n\t" + "vredxor.vs v6, v6, v8\n\t" + "vredxor.vs v7, v7, v8\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vxor.vv v0, v6, v2\n\t" + + "vclmul.vv v6, v0, v5\n\t" + "vclmulh.vv v7, v0, v5\n\t" + "vredxor.vs v6, v6, v8\n\t" + "vredxor.vs v7, v7, v8\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vxor.vv v0, v6, v3\n\t" + + // Extract 128-bit result from vector register + "addi sp, sp, -16\n\t" + "vse64.v v0, (sp)\n\t" + "ld t0, 0(sp)\n\t" + "ld t1, 8(sp)\n\t" + "addi sp, sp, 16\n\t" + + // Barrett reduction + "li t2, %[const0]\n\t" + "and t2, t2, t5\n\t" + "li t3, %[const1]\n\t" + + "clmul t4, t0, t3\n\t" + "clmulh t3, t0, t3\n\t" + "xor t1, t1, t4\n\t" + "and t4, t1, t5\n\t" + "srli t1, t1, 32\n\t" + "clmul t0, t4, t2\n\t" + "slli t3, t3, 32\n\t" + "xor t3, t3, t1\n\t" + "xor t3, t3, t0\n\t" + + // Final Barrett reduction + "and t4, t3, t5\n\t" + "li t2, %[quo]\n\t" + "li t1, %[poly]\n\t" + "clmul t4, t4, t2\n\t" + "and t4, t4, t5\n\t" + "clmul t4, t4, t1\n\t" + "xor t4, t3, t4\n\t" + "srai %[result], t4, 32\n\t" + "and %[result], %[result], t5\n\t" + + : [result] "=r" (result) + : [buf] "r" (buf), [len] "r" (len), [crc] "r" (crc), [consts] "r" (fold_consts), + [const0] "i" (CRC32C_CONST_0), [const1] "i" (CRC32C_CONST_1), + [quo] "i" (CRC32C_CONST_QUO), [poly] "i" (CRC32C_CONST_POLY) + : "a3", "a4", "a5", "t0", "t1", "t2", "t3", "t4", "t5", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "memory" + ); + size_t tail_len = len % 64; + if (tail_len > 0){ + result = crc32c_sb8(result, buf + len - tail_len, tail_len); + } + return result; +} + +/** + * Hardware-accelerated CRC32 (GZIP/zlib) using RISC-V vector crypto extensions. + */ +static uint32_t crc32_zlib_vclmul(const uint8_t *buf, size_t len, uint32_t crc) { + if (unlikely(len < 64)) { + // Fall back to table-based implementation for small buffers + return crc32_zlib_sb8(crc, buf, len); + } + + uint32_t result; + const uint64_t *fold_consts = crc32_fold_const; + + __asm__ __volatile__( + // Initialize CRC + "li t5, 0xffffffff\n\t" + "and %[crc], %[crc], t5\n\t" + "li a3, 0\n\t" + "li t1, 64\n\t" + + // Set vector configuration for 128-bit elements + "vsetivli zero, 2, e64, m1, ta, ma\n\t" + + // Load first 64 bytes and initialize + "mv a4, %[buf]\n\t" + "vle64.v v0, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "vle64.v v1, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "vle64.v v2, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "vle64.v v3, 0(a4)\n\t" + "addi a4, a4, 16\n\t" + "andi a3, %[len], ~63\n\t" + "addi t0, a3, -64\n\t" + + // XOR initial CRC into first vector + "vmv.s.x v4, zero\n\t" + "vmv.s.x v5, %[crc]\n\t" + "vslideup.vi v5, v4, 1\n\t" + "vxor.vv v0, v0, v5\n\t" + "vmv.s.x v8, zero\n\t" + + // Load folding constant + "add a5, a4, t0\n\t" + "mv t4, %[consts]\n\t" + "vle64.v v5, 0(t4)\n\t" + + // Check if we need main loop + "addi t0, %[len], -64\n\t" + "bltu t0, t1, 2f\n\t" + + // Main loop - process 64 bytes at a time + "1:\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v0, v5\n\t" + "vclmulh.vv v0, v0, v5\n\t" + "vredxor.vs v0, v0, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v0, 1\n\t" + "vxor.vv v0, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v1, v5\n\t" + "vclmulh.vv v1, v1, v5\n\t" + "vredxor.vs v1, v1, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v1, 1\n\t" + "vxor.vv v1, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v2, v5\n\t" + "vclmulh.vv v2, v2, v5\n\t" + "vredxor.vs v2, v2, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + 
"vslideup.vi v4, v2, 1\n\t" + "vxor.vv v2, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "vle64.v v7, 0(a4)\n\t" + "vclmul.vv v4, v3, v5\n\t" + "vclmulh.vv v3, v3, v5\n\t" + "vredxor.vs v3, v3, v8\n\t" + "vredxor.vs v4, v4, v8\n\t" + "vslideup.vi v4, v3, 1\n\t" + "vxor.vv v3, v4, v7\n\t" + + "addi a4, a4, 16\n\t" + "bne a4, a5, 1b\n\t" + + // Fold 512 bits to 128 bits + "2:\n\t" + "addi t4, t4, 16\n\t" + "vle64.v v5, 0(t4)\n\t" + "vclmul.vv v6, v0, v5\n\t" + "vclmulh.vv v7, v0, v5\n\t" + "vredxor.vs v6, v6, v8\n\t" + "vredxor.vs v7, v7, v8\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vxor.vv v0, v6, v1\n\t" + + "vclmul.vv v6, v0, v5\n\t" + "vclmulh.vv v7, v0, v5\n\t" + "vredxor.vs v6, v6, v8\n\t" + "vredxor.vs v7, v7, v8\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vxor.vv v0, v6, v2\n\t" + + "vclmul.vv v6, v0, v5\n\t" + "vclmulh.vv v7, v0, v5\n\t" + "vredxor.vs v6, v6, v8\n\t" + "vredxor.vs v7, v7, v8\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vxor.vv v0, v6, v3\n\t" + + // Extract 128-bit result from vector register + "addi sp, sp, -16\n\t" + "vse64.v v0, (sp)\n\t" + "ld t0, 0(sp)\n\t" + "ld t1, 8(sp)\n\t" + "addi sp, sp, 16\n\t" + + // Barrett reduction + "li t2, %[const0]\n\t" + "and t2, t2, t5\n\t" + "li t3, %[const1]\n\t" + "and t3, t3, t5\n\t" + + "clmul t4, t0, t3\n\t" + "clmulh t3, t0, t3\n\t" + "xor t1, t1, t4\n\t" + "and t4, t1, t5\n\t" + "srli t1, t1, 32\n\t" + "clmul t0, t4, t2\n\t" + "slli t3, t3, 32\n\t" + "xor t3, t3, t1\n\t" + "xor t3, t3, t0\n\t" + + // Final Barrett reduction + "and t4, t3, t5\n\t" + "li t2, %[quo]\n\t" + "li t1, %[poly]\n\t" + "clmul t4, t4, t2\n\t" + "and t4, t4, t5\n\t" + "clmul t4, t4, t1\n\t" + "xor t4, t3, t4\n\t" + "srai %[result], t4, 32\n\t" + "and %[result], %[result], t5\n\t" + + : [result] "=r" (result) + : [buf] "r" (buf), [len] "r" (len), [crc] "r" (crc), [consts] "r" (fold_consts), + [const0] "i" (CRC32_CONST_0), [const1] "i" (CRC32_CONST_1), + [quo] "i" (CRC32_CONST_QUO), [poly] "i" (CRC32_CONST_POLY) + : "a3", "a4", "a5", "t0", "t1", "t2", "t3", "t4", "t5", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "memory" + ); + + size_t tail_len = len % 64; + if (tail_len > 0) { + result = crc32_zlib_sb8(result, buf + len - tail_len, tail_len); + } + return result; +} + +static void pipelined_crc32c(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, + const uint8_t *p_buf, size_t block_size, int num_blocks) { + assert(num_blocks >= 1 && num_blocks <= 3 && "invalid num_blocks"); + + *crc1 = crc32c_vclmul(p_buf, block_size, *crc1); + + if (num_blocks >= 2) { + *crc2 = crc32c_vclmul(p_buf + block_size, block_size, *crc2); + } + + if (num_blocks >= 3) { + *crc3 = crc32c_vclmul(p_buf + 2 * block_size, block_size, *crc3); + } +} + +static void pipelined_crc32_zlib(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3, + const uint8_t *p_buf, size_t block_size, int num_blocks) { + assert(num_blocks >= 1 && num_blocks <= 3 && "invalid num_blocks"); + + *crc1 = crc32_zlib_vclmul(p_buf, block_size, *crc1); + + if (num_blocks >= 2) { + *crc2 = crc32_zlib_vclmul(p_buf + block_size, block_size, *crc2); + } + + if (num_blocks >= 3) { + *crc3 = crc32_zlib_vclmul(p_buf + 2 * block_size, block_size, *crc3); + } +} + +typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *, const uint8_t *, size_t, int); +extern crc_pipelined_func_t pipelined_crc32c_func; +extern crc_pipelined_func_t pipelined_crc32_zlib_func; + +/** + * Runtime detection of RISC-V vector crypto support. 
+ */ +#include +#include + +#include +#include +#include +#include +#include +#include + + // riscv_hwprobe definitions for compatibility with older kernels +#ifndef __NR_riscv_hwprobe +#if defined(__riscv) && __riscv_xlen == 64 + #define __NR_riscv_hwprobe 258 +#endif +#endif + +// RISC-V hardware capability detection +#ifndef COMPAT_HWCAP_ISA_V +#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A')) +#endif + +// Define riscv_hwprobe structure if not available in headers +#ifndef HAVE_RISCV_HWPROBE +struct riscv_hwprobe { + __s64 key; + __u64 value; +}; +#endif + +// hwprobe key definitions +#ifndef RISCV_HWPROBE_KEY_IMA_EXT_0 +#define RISCV_HWPROBE_KEY_IMA_EXT_0 4 +#define RISCV_HWPROBE_EXT_ZBC (1 << 7) +#define RISCV_HWPROBE_EXT_ZVBC (1 << 18) +#endif + +#define REQ_KERNEL_MAJOR 6 +#define REQ_KERNEL_MINOR 4 + +/** + * Parse kernel version from uname + */ +static int parse_kernel_version(int *major, int *minor, int *patch) { + struct utsname uts; + if (uname(&uts) != 0) { + return -1; + } + + *patch = 0; // Default value if not parsed + if (sscanf(uts.release, "%d.%d.%d", major, minor, patch) >= 2) { + return 0; + } + + return -1; +} + +/** + * Check if kernel version supports hwprobe + * hwprobe was introduced in Linux 6.4 + */ +static int kernel_supports_hwprobe(void) { + int major, minor, patch; + if (parse_kernel_version(&major, &minor, &patch) != 0) { + return 0; // Unknown, assume not supported + } + + if (major > REQ_KERNEL_MAJOR) { + return 1; + } + if (major == REQ_KERNEL_MAJOR && minor >= REQ_KERNEL_MINOR) { + return 1; + } + + return 0; +} + +/** + * Detect extensions using riscv_hwprobe + * Returns: -1 on error + */ +static int detect_with_hwprobe(int *has_zbc, int *has_zvbc) { + #ifdef __NR_riscv_hwprobe + if (!kernel_supports_hwprobe()) { + return -1; + } + + struct riscv_hwprobe probe; + probe.key = RISCV_HWPROBE_KEY_IMA_EXT_0; + probe.value = 0; + + long ret = syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0); + if (ret != 0) { + return -1; + } + + *has_zbc = (probe.value & RISCV_HWPROBE_EXT_ZBC) ? 1 : 0; + *has_zvbc = (probe.value & RISCV_HWPROBE_EXT_ZVBC) ? 
1 : 0; + + return 0; + #else + return -1; + #endif +} + +/** + * Check for RISC-V extension support via /proc/cpuinfo + */ +static int check_riscv_extension(const char *extension) { + FILE *cpuinfo = fopen("/proc/cpuinfo", "r"); + if (!cpuinfo) { + return 0; + } + + char line[512]; + int found = 0; + + while (fgets(line, sizeof(line), cpuinfo)) { + if(strncmp(line, "isa", 3) == 0) { + if (strstr(line, extension)) { + found = 1; + break; + } + } + } + + fclose(cpuinfo); + return found; +} + +/** + * Detect extensions via /proc/cpuinfo + */ +static int detect_with_cpuinfo(int *has_zbc, int *has_zvbc) { + + *has_zbc = check_riscv_extension("zbc"); + *has_zvbc = check_riscv_extension("zvbc"); + + if (*has_zbc == 0 && *has_zvbc == 0) { + return -1; + } + + return 0; +} + +/** + * Main detection function with multiple fallback methods + */ +static int detect_crypto_extensions(int *has_zbc, int *has_zvbc) { + if (has_zbc == NULL || has_zvbc == NULL){ + return -1; + } + + static int cached_zbc = -1; + static int cached_zvbc = -1; + + // Return cached results if available + if (cached_zbc != -1 && cached_zvbc != -1) { + *has_zbc = cached_zbc; + *has_zvbc = cached_zvbc; + return 0; + } + + // Initialize to not found + *has_zbc = 0; + *has_zvbc = 0; + + if (detect_with_hwprobe(has_zbc, has_zvbc) == 0) { + cached_zbc = *has_zbc; + cached_zvbc = *has_zvbc; + return 0; + } + + if (detect_with_cpuinfo(has_zbc, has_zvbc) < 0) { + return -1; + } + + cached_zbc = *has_zbc; + cached_zvbc = *has_zvbc; + return 0; +} + +/** + * Constructor function that runs when the library is loaded. + * Detects hardware support and sets the function pointers accordingly. + */ +void __attribute__((constructor)) init_cpu_support_flag(void) { + unsigned long auxval = getauxval(AT_HWCAP); + int has_zbc = 0, has_zvbc = 0; + if (auxval & COMPAT_HWCAP_ISA_V) { + if (detect_crypto_extensions(&has_zbc, &has_zvbc) == 0 && has_zbc && has_zvbc) { + pipelined_crc32c_func = pipelined_crc32c; + pipelined_crc32_zlib_func = pipelined_crc32_zlib; + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c b/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c index ef3dbecceca08..6a11ac43bac27 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/test/org/apache/hadoop/util/test_bulk_crc32.c @@ -104,6 +104,8 @@ int main(int argc, char **argv) EXPECT_ZERO(testBulkVerifyCrc(17, CRC32_ZLIB_POLYNOMIAL, 2)); EXPECT_ZERO(testBulkVerifyCrc(17, CRC32C_POLYNOMIAL, 4)); EXPECT_ZERO(testBulkVerifyCrc(17, CRC32_ZLIB_POLYNOMIAL, 4)); + EXPECT_ZERO(testBulkVerifyCrc(257, CRC32C_POLYNOMIAL, 64)); + EXPECT_ZERO(testBulkVerifyCrc(257, CRC32_ZLIB_POLYNOMIAL, 64)); EXPECT_ZERO(timeBulkCrc(16 * 1024, CRC32C_POLYNOMIAL, 512, 1000000)); EXPECT_ZERO(timeBulkCrc(16 * 1024, CRC32_ZLIB_POLYNOMIAL, 512, 1000000));
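The two new test cases pair a 257-byte buffer with a 64-byte bytes-per-checksum, so every full chunk lands exactly on the 64-byte vclmul threshold while the trailing 1-byte chunk still exercises the sb8 fallback, covering both sides of the new dispatch. To check what a given machine reports before relying on the accelerated path, the detection order used above (AT_HWCAP for the base 'V' extension, then riscv_hwprobe for Zbc/Zvbc) can be reproduced standalone; the constants below are the same ones the patch defines, the program itself is only an illustration:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/auxv.h>
    #include <sys/syscall.h>
    #include <linux/types.h>

    #ifndef __NR_riscv_hwprobe
    #define __NR_riscv_hwprobe 258          /* riscv64 Linux >= 6.4 */
    #endif

    struct riscv_hwprobe { __s64 key; __u64 value; };

    int main(void) {
        /* Base vector extension, reported through the auxiliary vector. */
        unsigned long hwcap = getauxval(AT_HWCAP);
        printf("V    : %s\n", (hwcap & (1UL << ('V' - 'A'))) ? "yes" : "no");

        /* Zbc/Zvbc, reported through the riscv_hwprobe syscall. */
        struct riscv_hwprobe probe = { .key = 4 /* RISCV_HWPROBE_KEY_IMA_EXT_0 */ };
        if (syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0) == 0) {
            printf("Zbc  : %s\n", (probe.value & (1ULL << 7))  ? "yes" : "no");
            printf("Zvbc : %s\n", (probe.value & (1ULL << 18)) ? "yes" : "no");
        } else {
            printf("riscv_hwprobe unavailable; fall back to /proc/cpuinfo\n");
        }
        return 0;
    }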