[Math] Dispatch T.rsqrt(x) into cuda intrin instead of 1 / T.sqrt(x) #781
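For context, the sketch below is illustrative only (plain upstream TVMScript, not code from this PR; the kernel shape, the build call, and the source inspection are assumptions): it shows the kind of `T.rsqrt` call this change targets. With the new lowering rule, the generated CUDA source is expected to call `rsqrtf` directly instead of expanding the operation to `1.0f / sqrtf(...)`.

```python
# Illustrative sketch only (assumes a CUDA-enabled TVM build); not from this PR.
import tvm
from tvm.script import tir as T


@T.prim_func
def rsqrt_kernel(A: T.Buffer((1024,), "float32"),
                 B: T.Buffer((1024,), "float32")):
    # One thread per element; each thread computes B[i] = rsqrt(A[i]).
    for i in T.thread_binding(1024, thread="threadIdx.x"):
        with T.block("compute"):
            vi = T.axis.spatial(1024, i)
            B[vi] = T.rsqrt(A[vi])


# Build for CUDA and inspect the generated kernel source. With rsqrt
# dispatched to an intrinsic, the source should contain rsqrtf(...) rather
# than a 1.0f / sqrtf(...) expansion.
mod = tvm.build(rsqrt_kernel, target="cuda")
print(mod.imported_modules[0].get_source())
```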
Changes from all commits
`intrin_rule_cuda.cc` (new file, +138 lines):

```cpp
/*!
 * \file intrin_rule_cuda.cc
 * \brief CUDA intrinsic rules.
 */
#include <tvm/tir/builtin.h>
#include <tvm/tir/op_attr_types.h>

#include "target/intrin_rule.h"

namespace tvm {
namespace codegen {
namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math.
using tir::FLowerIntrinsic;

struct CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float()) {
      switch (t.bits()) {
        case 64:
          return name;
        case 32:
          return name + 'f';
        case 16: {
          if (name == "fabs") {
            return "__habs";
          } else if (name == "round") {
            return "hrint";
          } else {
            return "h" + name;
          }
        }
        default:
          return "";
      }
    } else if (t.is_bfloat16()) {
      if (name == "fabs") {
        return "__habs";
      } else if (name == "round") {
        return "hrint";
      } else {
        return "h" + name;
      }
    } else if (t.is_int() || t.is_uint()) {
      switch (t.bits()) {
        case 32:
          return "__" + name;
        case 64:
          return "__" + name + "ll";
        default:
          return "";
      }
    }
    return "";
  }
};

struct CUDAFastMath : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float() && t.bits() == 32) {
      return "__" + name + 'f';
    } else {
      return CUDAMath::operator()(t, name);
    }
    return "";
  }
};

struct CUDAFastMathTan : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float()) {
      switch (t.bits()) {
        case 64:
          return name;
        // `__tanf` seems to produce some values too deviant from numpy tan
        // version. So, let's use just `tanf` instead.
        case 32:
          return name + 'f';
        case 16:
          return 'h' + name;
        default:
          return "";
      }
    }
    return "";
  }
};

struct CUDAPopcount {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_uint()) {
      switch (t.bits()) {
        case 32:
          return "__popc";
        case 64:
          return "__popcll";
        default:
          return "";
      }
    }
    return "";
  }
};

struct CUDAWarpIntrinsic {
  const Op operator()(DataType t, const Op &orig_op) const {
    if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
      return Op::Get("tir.cuda.__shfl_sync");
    } else if (orig_op.same_as(builtin::tvm_warp_shuffle_up())) {
      return Op::Get("tir.cuda.__shfl_up_sync");
    } else {
      ICHECK(orig_op.same_as(builtin::tvm_warp_shuffle_down()));
      return Op::Get("tir.cuda.__shfl_down_sync");
    }
  }
};

static PrimExpr DispatchCUDAWarpActiveMask(const PrimExpr &e) {
  const CallNode *call = e.as<CallNode>();
  return Call(call->dtype, Op::Get("tir.cuda.__activemask"), call->args);
}

template <typename T> static PrimExpr DispatchCUDAShuffle(const PrimExpr &e) {
  const CallNode *call = e.as<CallNode>();
  ICHECK(call != nullptr);
  ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
  Array<PrimExpr> cuda_args{
      {call->args[0], call->args[1], call->args[2], call->args[3]}};
  return Call(call->dtype, T()(call->dtype, Downcast<Op>(call->op)), cuda_args);
}

TVM_REGISTER_OP("tir.rsqrt")
    .set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic",
                               DispatchPureExtern<CUDAMath>);

}  // namespace intrin
}  // namespace codegen
}  // namespace tvm
```
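The registration at the bottom is the core of the change: `tir.rsqrt` is now lowered by name through `CUDAMath`, yielding `rsqrt` for float64, `rsqrtf` for float32, and `hrsqrt` for float16 and bfloat16, instead of being legalized to `1 / sqrt(x)`. The `hrsqrt` spelling is also why the CUDA template header below gains a matching `hrsqrt(half_t)` overload.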
`src/tl_templates/cuda/common.h`:

```diff
@@ -55,6 +55,11 @@ TL_PATCH TL_DEVICE half_t __habs(const half_t x) {
   return half_t(__habs(x.to_half()));
 }
 
+// hrsqrt function for half_t
+TL_PATCH TL_DEVICE half_t hrsqrt(const half_t x) {
+  return half_t(hrsqrt(x.to_half()));
+}
+
 // Pack two half values.
 TL_DEVICE unsigned __pack_half2(const half x, const half y) {
   unsigned v0 = *((unsigned short *)&x);
```
Review comment on lines +58 to +62 (Contributor):

🛠️ Refactor suggestion — Fix infinite recursion in `hrsqrt(half_t)` and use the device `rsqrtf`.

Suggested change:

```diff
-// hrsqrt function for half_t
-TL_PATCH TL_DEVICE half_t hrsqrt(const half_t x) {
-  return half_t(hrsqrt(x.to_half()));
-}
+// hrsqrt function for half_t
+TL_PATCH TL_DEVICE half_t hrsqrt(const half_t x) {
+  float xf = static_cast<float>(x);
+  float rf = rsqrtf(xf);
+  return half_t(rf);
+}
```

💡 Verification agent — Add a `bfloat16_t` overload of `hrsqrt` to match the lowering for BF16.

Proposed addition (outside the shown hunk):

```cpp
TL_PATCH TL_DEVICE bfloat16_t hrsqrt(const bfloat16_t x) {
  float xf = static_cast<float>(x);
  float rf = rsqrtf(xf);
  return bfloat16_t(rf);
}
```

Verification script run against the header:

```bash
#!/bin/bash
# Search for existing bfloat16_t overloads or mentions in cuda common.h
rg -n "bfloat16_t" -C3 src/tl_templates/cuda/common.h
# Search specifically for TL_DEVICE functions taking bfloat16_t
rg -nP --type=cpp "TL_DEVICE.*bfloat16_t" -C3 src/tl_templates/cuda/common.h
```

Conclusion: add the `bfloat16_t` `hrsqrt` overload shown above.
A separate review comment on the Python `get_cmake_path` helper (outside the hunks shown above):

🛠️ Refactor suggestion — Harden `get_cmake_path`: handle `None` from `shutil.which`, support an environment override, and raise `RuntimeError`.

The current code calls `os.path.exists` on a possibly-`None` value and throws `TypeError`; it also only checks for `cmake` (not `cmake3`) and raises a generic `Exception`.

🪛 Ruff (0.12.2) flags the same function:
- Line 249: Create your own exception (TRY002)
- Line 249: Avoid specifying long messages outside the exception class (TRY003)
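A possible shape for the hardened helper, sketched under stated assumptions: the function name `get_cmake_path` comes from the review, but the `CMAKE` environment variable, the `cmake`/`cmake3` search order, and the error message are illustrative choices, not the project's actual code.

```python
# Hedged sketch of the reviewer's suggestion; names other than get_cmake_path
# (e.g. the CMAKE environment variable) are assumptions for illustration.
import os
import shutil


def get_cmake_path() -> str:
    """Locate a usable cmake executable, or raise RuntimeError."""
    # Optional override via environment variable (assumed name: CMAKE).
    env_cmake = os.environ.get("CMAKE")
    if env_cmake and os.path.exists(env_cmake):
        return env_cmake
    # shutil.which returns None when the tool is missing, so guard against it
    # instead of passing None to os.path.exists.
    for candidate in ("cmake", "cmake3"):
        path = shutil.which(candidate)
        if path is not None:
            return path
    raise RuntimeError(
        "cmake not found: install cmake/cmake3 or set the CMAKE environment variable"
    )
```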