
Commit 41b38f7

nikhil-arm authored and pytorchmergebot committed
Revert "Reverting the PR adding Kleidiai-based int4 kernels (pytorch#145392)" (pytorch#145505)
pytorch#134124 was reverted by pytorch#145392 due to a KleidiAI clone issue.

1. This reverts commit 0940eb6 (pytorch#145392) and fixes the KleidiAI mirror issue.
2. KleidiAI is now cloned from the GitHub mirror instead of Arm GitLab.

Change-Id: I7d6eee7214cd117d3057d615936fcc3ee6052fa2

Fixes pytorch#145273

Pull Request resolved: pytorch#145505
Approved by: https://github.com/malfet
1 parent 34b8d8b commit 41b38f7

37 files changed: +1940 −23 lines

.gitmodules (+3)

@@ -131,3 +131,6 @@
 	path = third_party/composable_kernel
 	url = https://github.com/ROCm/composable_kernel.git
 	branch = develop
+[submodule "third_party/kleidiai"]
+	path = third_party/kleidiai
+	url = https://github.com/ARM-software/kleidiai.git

BUILD.bazel (+2)

@@ -257,6 +257,7 @@ filegroup(
     # target that generates these sources...
 )
 
+# TODO: Enable support for KleidiAI bazel build
 header_template_rule(
     name = "aten_src_ATen_config",
     src = "aten/src/ATen/Config.h.in",
@@ -276,6 +277,7 @@ header_template_rule(
         "@AT_PARALLEL_NATIVE@": "1",
         "@AT_BLAS_F2C@": "0",
         "@AT_BLAS_USE_CBLAS_DOT@": "1",
+        "@AT_KLEIDIAI_ENABLED@": "0",
     },
 )

CMakeLists.txt (+7)

@@ -377,6 +377,8 @@ cmake_dependent_option(
 cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
 cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
                        OFF "USE_CUDA" OFF)
+cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
+                       "CPU_AARCH64" OFF)
 
 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
@@ -418,6 +420,8 @@ endif()
 if(WIN32)
   set(USE_TENSORPIPE OFF)
   message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF")
+  set(USE_KLEIDIAI OFF)
+  message(WARNING "KleidiAI cannot be used on Windows. Set it to OFF")
 
   if(USE_DISTRIBUTED AND NOT DEFINED ENV{libuv_ROOT})
     find_library(
@@ -667,6 +671,9 @@ if(ANDROID
   message(WARNING "INTERN_BUILD_MOBILE is on, disabling BUILD_LAZY_TS_BACKEND")
   set(BUILD_LAZY_TS_BACKEND OFF)
 
+  set(USE_KLEIDIAI OFF)
+  message(WARNING "KleidiAI cannot be used on Mobile builds. Set it to OFF")
+
   # Set -ffunction-sections and -fdata-sections so that each method has its own
   # text section. This allows the linker to remove unused section when the flag
   # -Wl,-gc-sections is provided at link time.

WORKSPACE (+6)

@@ -309,6 +309,12 @@ local_repository(
     path = "third_party/gemmlowp/gemmlowp",
 )
 
+local_repository(
+    name = "kleidiai",
+    path = "third_party/kleidiai",
+    repo_mapping = {"@com_google_googletest": "@com_google_benchmark"},
+)
+
 ### Unused repos start
 
 # `unused` repos are defined to hide bazel files from submodules of submodules.

aten/src/ATen/CMakeLists.txt (+9, −1)

@@ -219,6 +219,10 @@ endif()
 # XNNPACK
 file(GLOB native_xnnpack "native/xnnpack/*.cpp")
 
+# KLEIDIAI
+file(GLOB native_kleidiai "native/kleidiai/*.cpp")
+file(GLOB native_kleidiai_h "native/kleidiai/*.h")
+
 # Add files needed from jit folders
 append_filelist("jit_core_headers" ATen_CORE_HEADERS)
 append_filelist("jit_core_sources" ATen_CORE_SRCS)
@@ -248,6 +252,10 @@ endif()
 if(AT_MKL_ENABLED)
   set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp})
 endif()
+if(AT_KLEIDIAI_ENABLED)
+  set(all_cpu_cpp ${all_cpu_cpp} ${native_kleidiai})
+  include_directories(SYSTEM INTERFACE ${KLEIDIAI_INCLUDE_DIRS})
+endif()
 if(AT_MKLDNN_ENABLED)
   set(all_cpu_cpp ${all_cpu_cpp} ${mkldnn_cpp})
 endif()
@@ -637,7 +645,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
 
 set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS})
 if(NOT INTERN_BUILD_MOBILE)
-  list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h} ${mkldnn_xpu_h})
+  list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h} ${mkldnn_xpu_h})
   # Metal
   if(USE_PYTORCH_METAL_EXPORT)
     # Add files needed from exporting metal models(optimized_for_mobile)

aten/src/ATen/Config.h.in (+1)

@@ -19,3 +19,4 @@
 #define AT_PARALLEL_NATIVE @AT_PARALLEL_NATIVE@
 #define AT_BLAS_F2C() @AT_BLAS_F2C@
 #define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
+#define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@

aten/src/ATen/Context.cpp (+4)

@@ -435,6 +435,10 @@ bool Context::hasMKLDNN() {
 #endif
 }
 
+bool Context::hasKleidiAI() {
+  return AT_KLEIDIAI_ENABLED();
+}
+
 bool Context::hasOpenMP() {
 #ifdef _OPENMP
   return true;

aten/src/ATen/Context.h (+5)

@@ -119,6 +119,7 @@ class TORCH_API Context {
 
   static bool hasOpenMP();
   static bool hasMKL();
+  static bool hasKleidiAI();
   static bool hasLAPACK();
   static bool hasMKLDNN();
   static bool hasMAGMA() {
@@ -550,6 +551,10 @@ inline bool hasMKL() {
   return globalContext().hasMKL();
 }
 
+inline bool hasKleidiAI() {
+  return globalContext().hasKleidiAI();
+}
+
 inline bool hasLAPACK() {
   return globalContext().hasLAPACK();
 }
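
For readers checking the flag from C++, a minimal usage sketch (assumptions: a standalone program compiled and linked against the ATen/libtorch headers and library). at::hasKleidiAI() is the free-function wrapper added to Context.h above; per Context.cpp it just returns AT_KLEIDIAI_ENABLED(), so it reports a build-time property rather than probing the CPU at runtime.

// Sketch only, not part of this commit: query the new capability flag.
// Returns true only when ATen was built with USE_KLEIDIAI=ON (aarch64).
#include <ATen/Context.h>
#include <iostream>

int main() {
  std::cout << "KleidiAI kernels built in: " << std::boolalpha
            << at::hasKleidiAI() << '\n';
  return 0;
}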

aten/src/ATen/native/LinearAlgebra.cpp (+67)

@@ -33,6 +33,8 @@
 #include <ATen/ops/_addmm_activation_native.h>
 #include <ATen/ops/_compute_linear_combination_native.h>
 #include <ATen/ops/_convert_weight_to_int4pack_for_cpu_native.h>
+#include <ATen/ops/_dyn_quant_matmul_4bit_native.h>
+#include <ATen/ops/_dyn_quant_pack_4bit_weight_native.h>
 #include <ATen/ops/_int_mm_native.h>
 #include <ATen/ops/_linalg_check_errors.h>
 #include <ATen/ops/_linalg_det.h>
@@ -3429,6 +3431,8 @@ Tensor kron(const Tensor& self, const Tensor& other) {
 DEFINE_DISPATCH(weight_to_int4pack_stub);
 DEFINE_DISPATCH(int4pack_mm_stub);
 DEFINE_DISPATCH(int8pack_mm_stub);
+DEFINE_DISPATCH(dyn_quant_pack_4bit_weight_stub);
+DEFINE_DISPATCH(dyn_quant_matmul_4bit_stub);
 
 Tensor _convert_weight_to_int4pack_cpu(
     const Tensor& in,
@@ -3492,6 +3496,69 @@ Tensor _weight_int4pack_mm_cpu(
   return C;
 }
 
+Tensor _dyn_quant_pack_4bit_weight_cpu(
+    const Tensor& weights,
+    const Tensor& scales_zeros,
+    const std::optional<Tensor>& bias,
+    const int64_t block_size,
+    const int64_t in_features,
+    const int64_t out_features) {
+  TORCH_CHECK(
+      weights.dtype() == at::kByte, __func__, " : expect weight to be kByte.");
+  TORCH_CHECK(
+      block_size == in_features ||
+          (!(block_size % 32) && !(in_features % block_size)),
+      __func__,
+      ": Group size should be multiple of 32, in_features [",
+      in_features,
+      "]. Provided ",
+      block_size);
+  Tensor packed_weights =
+      at::empty(weights.sizes(), weights.options().dtype(at::kByte));
+  dyn_quant_pack_4bit_weight_stub(
+      kCPU,
+      packed_weights,
+      weights,
+      scales_zeros,
+      bias,
+      out_features,
+      in_features,
+      block_size);
+  return packed_weights;
+}
+
+Tensor _dyn_quant_matmul_4bit_cpu(
+    const Tensor& inp,
+    const Tensor& packed_weights,
+    const int64_t block_size,
+    const int64_t in_features,
+    const int64_t out_features) {
+  auto M = inp.size(0);
+  TORCH_CHECK(
+      inp.dtype() == kFloat,
+      __func__,
+      " : expect input to be 32-bit float tensor.");
+  TORCH_CHECK(
+      block_size == in_features ||
+          (!(block_size % 32) && !(in_features % block_size)),
+      __func__,
+      ": Group size should be multiple of 32, in_features [",
+      in_features,
+      "]. Provided ",
+      block_size);
+  auto output = at::empty({M, out_features}, inp.options());
+  dyn_quant_matmul_4bit_stub(
+      kCPU,
+      output,
+      inp,
+      packed_weights,
+      M,
+      out_features,
+      in_features,
+      block_size);
+  return output;
+}
+
 Tensor _weight_int8pack_mm_cpu(
     const Tensor& A,
     const Tensor& B,
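
To make the new entry points concrete, here is a hedged C++ sketch of how the pack-then-matmul pair is meant to be driven. Assumptions not shown in this hunk: the dispatcher-level op names at::_dyn_quant_pack_4bit_weight and at::_dyn_quant_matmul_4bit are inferred from the *_native.h includes added above, and the weight/scale layouts (4-bit values packed two per byte, per-group scales and zero points) are illustrative only.

// Hedged sketch, not taken from this commit: drive the dynamic 4-bit path
// end to end. Op names and tensor layouts are assumptions (see lead-in).
#include <ATen/ATen.h>

int main() {
  const int64_t M = 4, in_features = 64, out_features = 32;
  // One quantization group per row; otherwise block_size must be a
  // multiple of 32 that divides in_features (per the TORCH_CHECKs above).
  const int64_t block_size = in_features;

  // 4-bit weights packed two per byte, plus per-group quantization params.
  at::Tensor weights =
      at::randint(0, 256, {out_features, in_features / 2}, at::kByte);
  at::Tensor scales_zeros = at::rand({out_features, 2}, at::kFloat);

  // Repack into the layout the backing kernel prefers (KleidiAI when built in).
  at::Tensor packed = at::_dyn_quant_pack_4bit_weight(
      weights, scales_zeros, /*bias=*/{}, block_size, in_features, out_features);

  // Dynamically quantize the fp32 activations and run the 4-bit matmul.
  at::Tensor inp = at::rand({M, in_features}, at::kFloat);
  at::Tensor out = at::_dyn_quant_matmul_4bit(
      inp, packed, block_size, in_features, out_features);
  // out has shape [M, out_features] and dtype float32.
  return 0;
}

The kernels registered behind dyn_quant_pack_4bit_weight_stub and dyn_quant_matmul_4bit_stub live outside this hunk, so the sketch assumes an aarch64 build configured with USE_KLEIDIAI=ON.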
