[FIX]Fix Machete compile via ENABLE_MACHETE (#3727)

Sunny-bot1 · aquagull · Jiang-Jia-Jun · web-flow · commit fe5d09f9ee9b · 2025-08-30T17:50:17.000+08:00
* add ENABLE_MACHETE

* fix

* revert

* update

* pre_commit

* fix

* fix

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: aquagull <hongyuh@qq.com>
diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("recv_expert_count"), py::arg("block_size"),
         "per token per block quant");
 
+#ifdef ENABLE_MACHETE
   /*machete/machete_mm.cu
    * machete_mm
    */
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
    * machete_supported_schedules
    */
   m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
+#endif
 
   /**
    * moe/fused_moe/moe_topk_select.cu
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
@@ -373,6 +373,7 @@ def find_end_files(directory, end_str):
         if not os.listdir(json_dir):
             raise ValueError("Git clone nlohmann_json failed!")
 
+    cc_compile_args = []
     nvcc_compile_args = get_gencode_flags(archs)
     nvcc_compile_args += ["-DPADDLE_DEV"]
     nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
@@ -519,12 +520,13 @@ def find_end_files(directory, end_str):
         sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
         os.system("python gpu_ops/machete/generate.py")
         sources += find_end_files("gpu_ops/machete", ".cu")
+        cc_compile_args += ["-DENABLE_MACHETE"]
 
     setup(
         name="fastdeploy_ops",
         ext_modules=CUDAExtension(
             sources=sources,
-            extra_compile_args={"nvcc": nvcc_compile_args},
+            extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
             libraries=["cublasLt"],
             extra_link_args=["-lcuda"],
         ),
diff --git a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py
@@ -26,8 +26,14 @@ def get_sm_version():
     return cc
 
 
+_ENABLE_MACHETE = False
 if current_platform.is_cuda() and get_sm_version() == 90:
-    from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+    try:
+        from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+
+        _ENABLE_MACHETE = True
+    except Exception:
+        pass
 
 
 def get_pack_factor(num_bits):
diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -34,12 +34,6 @@
 from .quant_base import QuantConfigBase, QuantMethodBase
 
 
-def get_sm_version():
-    prop = paddle.device.cuda.get_device_properties()
-    cc = prop.major * 10 + prop.minor
-    return cc
-
-
 class WeightOnlyConfig(QuantConfigBase):
     """
     Quantization config for weight only
@@ -139,10 +133,14 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
                 else:
                     raise ValueError(f"Unsupported MOE backend {layer.use_method}")
             else:
+                from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
+                    _ENABLE_MACHETE,
+                )
+
                 if (
                     self.name() == "wint4"
+                    and _ENABLE_MACHETE
                     and envs.FD_USE_MACHETE == "1"
-                    and get_sm_version() == 90
                     and layer.weight_shape[1]
                     and layer.weight_shape[1] % 128 == 0
                 ):