Skip to content
Merged
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 7a71ee to 883e96
25 changes: 25 additions & 0 deletions src/op/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,31 @@ DataType cuTensorMapType() { return DataType::UInt(8, 128); }
TVM_REGISTER_OP("tl." #OpName) \
.set_attr<TScriptPrinterName>("TScriptPrinterName", #OpName)

// Fast-math related builtin ops.
//
// Each op takes exactly one scalar input (set_num_inputs(1)). They are
// registered with TCallEffectKind = kOpaque, which keeps TIR passes from
// speculating, hoisting, or CSE-ing the calls. Lowering to target-specific
// intrinsics happens in the backend codegen (see codegen_cuda.cc in this
// change set).
TIR_DEFINE_TL_BUILTIN(__exp).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__exp10).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__log).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__log2).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__log10).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__tan).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__cos).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__sin).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(create_list_of_mbarrier)
.set_num_inputs(-1)
.set_attr<TCallEffectKind>("TCallEffectKind",
Expand Down
10 changes: 10 additions & 0 deletions src/op/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ static constexpr const char *kDynamicAlignment = "tl.dynamic_alignment";
*/
DataType cuTensorMapType();

// Fast-math related op accessors.
//
// Each returns the singleton Op registered in builtin.cc; all of them take a
// single scalar argument and are lowered by the CUDA codegen to fast/
// approximate device intrinsics.
// NOTE(review): identifiers beginning with a double underscore are reserved
// to the implementation in C++ ([lex.name]); consider renaming (e.g.
// tl_fast_exp) to avoid relying on reserved names — TODO confirm with owners.
TVM_DLL const Op &__exp();
TVM_DLL const Op &__exp10();
TVM_DLL const Op &__log();
TVM_DLL const Op &__log2();
TVM_DLL const Op &__log10();
TVM_DLL const Op &__tan();
TVM_DLL const Op &__cos();
TVM_DLL const Op &__sin();

/*!
* \brief tvm intrinsics for TMADescriptor creation for tiled load
*
Expand Down
105 changes: 105 additions & 0 deletions src/target/codegen_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,79 @@ namespace tvm {
namespace codegen {
using namespace tvm::tl::codegen;

// Maps a generic libm-style function name to its CUDA spelling for the given
// scalar type. Returns an empty string when the type has no mapping (the
// caller is expected to treat "" as "unsupported").
struct CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    // fp16 and bf16 share the half-precision intrinsic naming scheme
    // (hexp, hlog, ...), with two irregular spellings handled explicitly.
    const auto half_name = [&name]() -> std::string {
      if (name == "fabs") {
        return "__habs";
      }
      if (name == "round") {
        return "hrint";
      }
      return "h" + name;
    };

    if (t.is_float()) {
      const int bits = t.bits();
      if (bits == 64) {
        return name;  // double: plain libm name
      }
      if (bits == 32) {
        return name + 'f';  // float: 'f'-suffixed variant
      }
      if (bits == 16) {
        return half_name();
      }
      return "";
    }
    if (t.is_bfloat16()) {
      return half_name();
    }
    if (t.is_int() || t.is_uint()) {
      const int bits = t.bits();
      if (bits == 32) {
        return "__" + name;
      }
      if (bits == 64) {
        return "__" + name + "ll";
      }
      return "";
    }
    return "";
  }
};

// Fast-math variant of CUDAMath: float32 maps to the __<name>f device
// intrinsics (e.g. __expf); every other type falls back to the precise
// CUDAMath mapping, since CUDA provides fast-math variants only for fp32.
struct CUDAFastMath : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float() && t.bits() == 32) {
      return "__" + name + 'f';
    }
    return CUDAMath::operator()(t, name);
  }
};

// Tan-specific mapper. Unlike CUDAFastMath it never emits `__tanf`:
// `__tanf` produces values too deviant from the numpy tan reference, so
// float32 sticks with plain `tanf`. fp16 routes to the h* intrinsic;
// non-float types are unsupported and yield "".
struct CUDAFastMathTan : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (!t.is_float()) {
      return "";
    }
    const int bits = t.bits();
    if (bits == 64) {
      return name;
    }
    if (bits == 32) {
      return name + 'f';  // deliberately NOT "__" + name + 'f'
    }
    if (bits == 16) {
      return 'h' + name;
    }
    return "";
  }
};

static std::string GetFP8Type(DataType type) {
std::stringstream stream;
int32_t lanes = type.lanes();
Expand Down Expand Up @@ -1628,6 +1701,38 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
op->args, true, os);
} else if (op->op.same_as(tl::tl_shuffle_elect())) {
os << "tl::tl_shuffle_elect<" << PrintExpr(op->args[0]) << ">()";
} else if (op->op.same_as(tl::__exp())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "exp");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__exp10())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "exp10");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__log())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "log");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__log2())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "log2");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__log10())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "log10");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__tan())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "tan");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__cos())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "cos");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__sin())) {
Comment on lines +1725 to +1732
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use the tan-specific fast-math mapper

We landed CUDAFastMathTan to avoid emitting __tanf (too inaccurate) and to route FP16 through the h* variants, but this branch still instantiates the generic CUDAFastMath, so we end up generating __tanf for float32. That defeats the accuracy fix and contradicts the new helper/comment.

Please switch this site to CUDAFastMathTan.

-  } else if (op->op.same_as(tl::__tan())) {
-    CUDAFastMath math_func;
+  } else if (op->op.same_as(tl::__tan())) {
+    CUDAFastMathTan math_func;
     std::string func_name = math_func(op->dtype, "tan");
     os << func_name << "(" << PrintExpr(op->args[0]) << ")";
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "tan");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__cos())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "cos");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__sin())) {
} else if (op->op.same_as(tl::__tan())) {
CUDAFastMathTan math_func;
std::string func_name = math_func(op->dtype, "tan");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__cos())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "cos");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__sin())) {
🤖 Prompt for AI Agents
In src/target/codegen_cuda.cc around lines 1725 to 1732, the branch handling tan
currently instantiates CUDAFastMath which emits __tanf for float32; replace that
instantiation with CUDAFastMathTan so the mapper picks the tan-specific
fast-math variants (and routes FP16 to h* variants). Concretely, change the
local type from CUDAFastMath to CUDAFastMathTan, keep the rest of the call to
math_func(op->dtype, "tan") and the PrintExpr usage unchanged, so the emitted
function name uses the tan-specialized mapper.

CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "sin");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
Comment on lines +1704 to +1735
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for handling the different fast math intrinsics is very repetitive. Each else if block contains almost identical code, creating a new CUDAFastMath object and calling it. This could be refactored to reduce duplication and improve maintainability, for example by using a map from the Op to the math function name string and handling them in a single block or a helper function.

} else {
CodeGenC::VisitExpr_(op, os);
}
Expand Down
Loading
Loading