
Commit b66f9aa

[Math] Dispatch T.rsqrt(x) into cuda intrin instead of 1 / T.sqrt(x) (#781)
* Fix type hint for target_host parameter in compile function to allow None value
* Refactor target handling in compile function to utilize determine_target for improved clarity and consistency
* Update PrintConst function in codegen_cuda.cc to use hexfloat format for bfloat16 and float8/float4 types, while adding scientific notation comments for clarity. This change enhances the representation of floating-point constants in the generated code.
* Refactor PrintType function in codegen_cuda.cc to remove unnecessary failure conditions for floating-point types with lane counts greater than 4. This change simplifies the logic and improves code clarity.
* Enhance benchmark_matmul.py to conditionally print Reference TFlops only if ref_latency is not None. Update param.py to ensure target is converted to string for consistency. Refactor tuner.py to utilize determine_target for improved clarity in target handling.
* Remove automatic commit and push step from AMD and NVIDIA CI workflows to streamline the process and avoid unnecessary commits.
* Add intrin_rule source files to CMakeLists.txt and implement hrsqrt function for half_t in common.h
* lint fix
* remove cmake dep in pyproject as it may lead to different cmake paths in different stages
* lint fix
* Add cmake dependency to pyproject.toml and improve build logging in setup.py
1 parent 021e44e commit b66f9aa
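
The headline change means that on CUDA targets a TIR T.rsqrt(x) call now lowers to the dedicated reciprocal-square-root intrinsics (rsqrtf for float32, hrsqrt for float16) instead of being expanded to 1 / T.sqrt(x). The following is a minimal standalone CUDA sketch, not taken from this commit (kernel and variable names are illustrative), contrasting the two lowerings for float32:

// Illustrative comparison of the two lowerings of rsqrt on CUDA (float32).
// Compile with: nvcc rsqrt_compare.cu -o rsqrt_compare
#include <cstdio>
#include <cuda_runtime.h>

__global__ void rsqrt_compare(const float *x, float *via_div, float *via_intrin, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    via_div[i] = 1.0f / sqrtf(x[i]);  // previous lowering: reciprocal of a square root
    via_intrin[i] = rsqrtf(x[i]);     // new lowering: single rsqrtf intrinsic
  }
}

int main() {
  const int n = 4;
  float hx[n] = {0.25f, 1.0f, 4.0f, 16.0f};
  float *x, *a, *b;
  cudaMalloc(&x, n * sizeof(float));
  cudaMalloc(&a, n * sizeof(float));
  cudaMalloc(&b, n * sizeof(float));
  cudaMemcpy(x, hx, n * sizeof(float), cudaMemcpyHostToDevice);
  rsqrt_compare<<<1, n>>>(x, a, b, n);
  float ha[n], hb[n];
  cudaMemcpy(ha, a, n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(hb, b, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i)
    printf("x=%g  1/sqrtf=%g  rsqrtf=%g\n", hx[i], ha[i], hb[i]);
  cudaFree(x);
  cudaFree(a);
  cudaFree(b);
  return 0;
}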

File tree

4 files changed: +177, -34 lines

CMakeLists.txt
setup.py
src/target/intrin_rule_cuda.cc
src/tl_templates/cuda/common.h

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -124,6 +124,8 @@ tilelang_file_glob(GLOB TILE_LANG_SRCS
     src/target/rt_mod_cpp.cc
     # webgpu doesn't have system dependency
     src/target/codegen_webgpu.cc
+    # intrin_rule doesn't have system dependency
+    src/target/intrin_rule*.cc
 )

 # Include CUDA source files if CUDA is enabled

setup.py

Lines changed: 32 additions & 34 deletions
@@ -203,6 +203,7 @@ def get_cplus_compiler():
     return None


+@functools.lru_cache(maxsize=None)
 def get_cython_compiler() -> Optional[str]:
     """Return the path to the Cython compiler.

@@ -238,6 +239,17 @@ def get_cython_compiler() -> Optional[str]:
     return None


+@functools.lru_cache(maxsize=None)
+def get_cmake_path() -> str:
+    """Return the path to the CMake compiler.
+    """
+    # found which cmake is used
+    cmake_path = shutil.which("cmake")
+    if not os.path.exists(cmake_path):
+        raise Exception("CMake is not installed, please install it first.")
+    return cmake_path
+
+
 def get_system_info():
     system = platform.system().lower()
     if system == "linux":

@@ -338,33 +350,6 @@ def is_git_repo():
         raise RuntimeError("Failed to update submodules") from error


-def build_csrc(llvm_config_path):
-    """Configures and builds TVM."""
-
-    if not os.path.exists("build"):
-        os.makedirs("build")
-    os.chdir("build")
-    # Copy the config.cmake as a baseline
-    if not os.path.exists("config.cmake"):
-        shutil.copy("../3rdparty/tvm/cmake/config.cmake", "config.cmake")
-    # Set LLVM path and enable CUDA or ROCM in config.cmake
-    with open("config.cmake", "a") as config_file:
-        config_file.write(f"set(USE_LLVM {llvm_config_path})\n")
-        if USE_ROCM:
-            config_file.write(f"set(USE_ROCM {ROCM_HOME})\n")
-            config_file.write("set(USE_CUDA OFF)\n")
-        else:
-            config_file.write(f"set(USE_CUDA {CUDA_HOME})\n")
-            config_file.write("set(USE_ROCM OFF)\n")
-    # Run CMake and make
-    try:
-        subprocess.check_call(["cmake", ".."])
-        num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
-        subprocess.check_call(["make", f"-j{num_jobs}"])
-    except subprocess.CalledProcessError as error:
-        raise RuntimeError("Failed to build TileLang C Source") from error
-
-
 def setup_llvm_for_tvm():
     """Downloads and extracts LLVM, then configures TVM to use it."""
     # Assume the download_and_extract_llvm function and its dependencies are defined elsewhere in this script

@@ -627,7 +612,10 @@ class TilelangExtensionBuild(build_ext):
     def run(self):
         # Check if CMake is installed and accessible by attempting to run 'cmake --version'.
         try:
-            subprocess.check_output(["cmake", "--version"])
+            cmake_path = get_cmake_path()
+            if not cmake_path:
+                raise Exception("CMake is not installed, please install it first.")
+            subprocess.check_output([cmake_path, "--version"])
         except OSError as error:
             # If CMake is not found, raise an error.
             raise RuntimeError(

@@ -830,15 +818,25 @@ def build_cmake(self, ext):
         else:
             print(f"[Config] No changes: {dst_config}")

+        cmake_path = get_cmake_path()
         # Run CMake to configure the project with the given arguments.
-        if not os.path.exists(build_temp + "/build.ninja"):
-            subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp)
+        if not os.path.exists(os.path.join(build_temp, "build.ninja")):
+            logger.info(
+                f"[CMake] Generating build.ninja: {cmake_path} {ext.sourcedir} {' '.join(cmake_args)}"
+            )
+            subprocess.check_call([cmake_path, ext.sourcedir] + cmake_args, cwd=build_temp)
+        else:
+            logger.info(f"[CMake] build.ninja already exists in {build_temp}")

-        # Build the project in "Release" mode with all available CPU cores ("-j").
         num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
-        subprocess.check_call(["cmake", "--build", ".", "--config", "Release", "-j",
-                               str(num_jobs)],
-                              cwd=build_temp)
+        logger.info(
+            f"[Build] Using {num_jobs} jobs | cmake: {cmake_path} (exists: {os.path.exists(cmake_path)}) | build dir: {build_temp}"
+        )
+
+        subprocess.check_call(
+            [cmake_path, "--build", ".", "--config", "Release", "-j",
+             str(num_jobs)],
+            cwd=build_temp)


 setup(

src/target/intrin_rule_cuda.cc

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@ (new file)

/*!
 * \file intrin_rule_cuda.cc
 * \brief CUDA intrinsic rules.
 */
#include <tvm/tir/builtin.h>
#include <tvm/tir/op_attr_types.h>

#include "target/intrin_rule.h"

namespace tvm {
namespace codegen {
namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math.
using tir::FLowerIntrinsic;

struct CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float()) {
      switch (t.bits()) {
      case 64:
        return name;
      case 32:
        return name + 'f';
      case 16: {
        if (name == "fabs") {
          return "__habs";
        } else if (name == "round") {
          return "hrint";
        } else {
          return "h" + name;
        }
      }
      default:
        return "";
      }
    } else if (t.is_bfloat16()) {
      if (name == "fabs") {
        return "__habs";
      } else if (name == "round") {
        return "hrint";
      } else {
        return "h" + name;
      }
    } else if (t.is_int() || t.is_uint()) {
      switch (t.bits()) {
      case 32:
        return "__" + name;
      case 64:
        return "__" + name + "ll";
      default:
        return "";
      }
    }
    return "";
  }
};

struct CUDAFastMath : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float() && t.bits() == 32) {
      return "__" + name + 'f';
    } else {
      return CUDAMath::operator()(t, name);
    }
    return "";
  }
};

struct CUDAFastMathTan : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float()) {
      switch (t.bits()) {
      case 64:
        return name;
      // `__tanf` seems to produce some values too deviant from numpy tan
      // version. So, let's use just `tanf` instead.
      case 32:
        return name + 'f';
      case 16:
        return 'h' + name;
      default:
        return "";
      }
    }
    return "";
  }
};

struct CUDAPopcount {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_uint()) {
      switch (t.bits()) {
      case 32:
        return "__popc";
      case 64:
        return "__popcll";
      default:
        return "";
      }
    }
    return "";
  }
};

struct CUDAWarpIntrinsic {
  const Op operator()(DataType t, const Op &orig_op) const {
    if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
      return Op::Get("tir.cuda.__shfl_sync");
    } else if (orig_op.same_as(builtin::tvm_warp_shuffle_up())) {
      return Op::Get("tir.cuda.__shfl_up_sync");
    } else {
      ICHECK(orig_op.same_as(builtin::tvm_warp_shuffle_down()));
      return Op::Get("tir.cuda.__shfl_down_sync");
    }
  }
};

static PrimExpr DispatchCUDAWarpActiveMask(const PrimExpr &e) {
  const CallNode *call = e.as<CallNode>();
  return Call(call->dtype, Op::Get("tir.cuda.__activemask"), call->args);
}

template <typename T> static PrimExpr DispatchCUDAShuffle(const PrimExpr &e) {
  const CallNode *call = e.as<CallNode>();
  ICHECK(call != nullptr);
  ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size
  Array<PrimExpr> cuda_args{
      {call->args[0], call->args[1], call->args[2], call->args[3]}};
  return Call(call->dtype, T()(call->dtype, Downcast<Op>(call->op)), cuda_args);
}

TVM_REGISTER_OP("tir.rsqrt")
    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic",
                               DispatchPureExtern<CUDAMath>);

} // namespace intrin
} // namespace codegen
} // namespace tvm
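
For the name "rsqrt", the CUDAMath functor above resolves to rsqrt for float64, rsqrtf for float32, and hrsqrt for float16/bfloat16, which is what the tir.rsqrt registration at the end of the file emits. The same functor could in principle cover other math ops; the snippet below is a hypothetical illustration only (it is not part of this commit) of what additional registrations would look like inside the same namespaces:

// Hypothetical sketch (not in this commit): further TIR math ops routed
// through the same CUDAMath name-mangling functor. float64 keeps the libm
// name, float32 gains an 'f' suffix (e.g. expf), and fp16/bf16 gain an 'h'
// prefix (e.g. hexp), exactly as CUDAMath encodes above.
TVM_REGISTER_OP("tir.exp").set_attr<FLowerIntrinsic>(
    "cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);

TVM_REGISTER_OP("tir.sqrt").set_attr<FLowerIntrinsic>(
    "cuda.FLowerIntrinsic", DispatchPureExtern<CUDAMath>);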

src/tl_templates/cuda/common.h

Lines changed: 5 additions & 0 deletions
@@ -55,6 +55,11 @@ TL_PATCH TL_DEVICE half_t __habs(const half_t x) {
   return half_t(__habs(x.to_half()));
 }

+// hrsqrt function for half_t
+TL_PATCH TL_DEVICE half_t hrsqrt(const half_t x) {
+  return half_t(hrsqrt(x.to_half()));
+}
+
 // Pack two half values.
 TL_DEVICE unsigned __pack_half2(const half x, const half y) {
   unsigned v0 = *((unsigned short *)&x);
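
The new overload forwards half_t through to_half() to the hrsqrt intrinsic declared in cuda_fp16.h, so overload resolution picks the builtin for the unwrapped __half value. Below is a small standalone CUDA sketch (illustrative only, not from the repo) exercising that underlying intrinsic; it assumes a device with native fp16 math (compute capability 5.3 or newer):

// Illustrative use of the cuda_fp16.h hrsqrt intrinsic that the new half_t
// wrapper forwards to. Requires device fp16 support (sm_53+).
#include <cstdio>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

__global__ void half_rsqrt(const __half *x, __half *y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = hrsqrt(x[i]);  // half-precision reciprocal square root
}

int main() {
  const int n = 3;
  __half hx[n] = {__float2half(0.25f), __float2half(1.0f), __float2half(4.0f)};
  __half *x, *y;
  cudaMalloc(&x, n * sizeof(__half));
  cudaMalloc(&y, n * sizeof(__half));
  cudaMemcpy(x, hx, n * sizeof(__half), cudaMemcpyHostToDevice);
  half_rsqrt<<<1, n>>>(x, y, n);
  __half hy[n];
  cudaMemcpy(hy, y, n * sizeof(__half), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i)
    printf("hrsqrt(%g) = %g\n", __half2float(hx[i]), __half2float(hy[i]));
  cudaFree(x);
  cudaFree(y);
  return 0;
}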
