Skip to content
Merged
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 7a71ee to 883e96
25 changes: 25 additions & 0 deletions src/op/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,31 @@ DataType cuTensorMapType() { return DataType::UInt(8, 128); }
TVM_REGISTER_OP("tl." #OpName) \
.set_attr<TScriptPrinterName>("TScriptPrinterName", #OpName)

// Fast-math related builtin ops.
//
// Each op takes exactly one scalar input (set_num_inputs(1)). They are
// registered with TCallEffectKind = kOpaque, which keeps TIR passes from
// speculating, hoisting, or CSE-ing the calls. Lowering to target-specific
// intrinsics happens in the backend codegen (see codegen_cuda.cc in this
// change set).
TIR_DEFINE_TL_BUILTIN(__exp).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__exp10).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__log).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__log2).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__log10).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__tan).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__cos).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(__sin).set_num_inputs(1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(create_list_of_mbarrier)
.set_num_inputs(-1)
.set_attr<TCallEffectKind>("TCallEffectKind",
Expand Down
10 changes: 10 additions & 0 deletions src/op/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ static constexpr const char *kDynamicAlignment = "tl.dynamic_alignment";
*/
DataType cuTensorMapType();

// Fast-math related op accessors.
//
// Each returns the singleton Op registered in builtin.cc; all of them take a
// single scalar argument and are lowered by the CUDA codegen to fast/
// approximate device intrinsics.
// NOTE(review): identifiers beginning with a double underscore are reserved
// to the implementation in C++ ([lex.name]); consider renaming (e.g.
// tl_fast_exp) to avoid relying on reserved names — TODO confirm with owners.
TVM_DLL const Op &__exp();
TVM_DLL const Op &__exp10();
TVM_DLL const Op &__log();
TVM_DLL const Op &__log2();
TVM_DLL const Op &__log10();
TVM_DLL const Op &__tan();
TVM_DLL const Op &__cos();
TVM_DLL const Op &__sin();

/*!
* \brief tvm intrinsics for TMADescriptor creation for tiled load
*
Expand Down
105 changes: 105 additions & 0 deletions src/target/codegen_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,79 @@ namespace tvm {
namespace codegen {
using namespace tvm::tl::codegen;

// Maps a generic libm-style function name to its CUDA spelling for the given
// scalar type. Returns an empty string when the type has no mapping (the
// caller is expected to treat "" as "unsupported").
struct CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    // fp16 and bf16 share the half-precision intrinsic naming scheme
    // (hexp, hlog, ...), with two irregular spellings handled explicitly.
    const auto half_name = [&name]() -> std::string {
      if (name == "fabs") {
        return "__habs";
      }
      if (name == "round") {
        return "hrint";
      }
      return "h" + name;
    };

    if (t.is_float()) {
      const int bits = t.bits();
      if (bits == 64) {
        return name;  // double: plain libm name
      }
      if (bits == 32) {
        return name + 'f';  // float: 'f'-suffixed variant
      }
      if (bits == 16) {
        return half_name();
      }
      return "";
    }
    if (t.is_bfloat16()) {
      return half_name();
    }
    if (t.is_int() || t.is_uint()) {
      const int bits = t.bits();
      if (bits == 32) {
        return "__" + name;
      }
      if (bits == 64) {
        return "__" + name + "ll";
      }
      return "";
    }
    return "";
  }
};

// Fast-math variant of CUDAMath: float32 maps to the __<name>f device
// intrinsics (e.g. __expf); every other type falls back to the precise
// CUDAMath mapping, since CUDA provides fast-math variants only for fp32.
struct CUDAFastMath : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (t.is_float() && t.bits() == 32) {
      return "__" + name + 'f';
    }
    return CUDAMath::operator()(t, name);
  }
};

// Tan-specific mapper. Unlike CUDAFastMath it never emits `__tanf`:
// `__tanf` produces values too deviant from the numpy tan reference, so
// float32 sticks with plain `tanf`. fp16 routes to the h* intrinsic;
// non-float types are unsupported and yield "".
struct CUDAFastMathTan : public CUDAMath {
  std::string operator()(DataType t, std::string name) const {
    if (!t.is_float()) {
      return "";
    }
    const int bits = t.bits();
    if (bits == 64) {
      return name;
    }
    if (bits == 32) {
      return name + 'f';  // deliberately NOT "__" + name + 'f'
    }
    if (bits == 16) {
      return 'h' + name;
    }
    return "";
  }
};

static std::string GetFP8Type(DataType type) {
std::stringstream stream;
int32_t lanes = type.lanes();
Expand Down Expand Up @@ -1628,6 +1701,38 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
op->args, true, os);
} else if (op->op.same_as(tl::tl_shuffle_elect())) {
os << "tl::tl_shuffle_elect<" << PrintExpr(op->args[0]) << ">()";
} else if (op->op.same_as(tl::__exp())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "exp");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__exp10())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "exp10");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__log())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "log");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__log2())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "log2");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__log10())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "log10");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__tan())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "tan");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__cos())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "cos");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__sin())) {
Comment on lines +1725 to +1732
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use the tan-specific fast-math mapper

We landed CUDAFastMathTan to avoid emitting __tanf (too inaccurate) and to route FP16 through the h* variants, but this branch still instantiates the generic CUDAFastMath, so we end up generating __tanf for float32. That defeats the accuracy fix and contradicts the new helper/comment.

Please switch this site to CUDAFastMathTan.

-  } else if (op->op.same_as(tl::__tan())) {
-    CUDAFastMath math_func;
+  } else if (op->op.same_as(tl::__tan())) {
+    CUDAFastMathTan math_func;
     std::string func_name = math_func(op->dtype, "tan");
     os << func_name << "(" << PrintExpr(op->args[0]) << ")";
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "tan");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__cos())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "cos");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__sin())) {
} else if (op->op.same_as(tl::__tan())) {
CUDAFastMathTan math_func;
std::string func_name = math_func(op->dtype, "tan");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__cos())) {
CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "cos");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
} else if (op->op.same_as(tl::__sin())) {
🤖 Prompt for AI Agents
In src/target/codegen_cuda.cc around lines 1725 to 1732, the branch handling tan
currently instantiates CUDAFastMath which emits __tanf for float32; replace that
instantiation with CUDAFastMathTan so the mapper picks the tan-specific
fast-math variants (and routes FP16 to h* variants). Concretely, change the
local type from CUDAFastMath to CUDAFastMathTan, keep the rest of the call to
math_func(op->dtype, "tan") and the PrintExpr usage unchanged, so the emitted
function name uses the tan-specialized mapper.

CUDAFastMath math_func;
std::string func_name = math_func(op->dtype, "sin");
os << func_name << "(" << PrintExpr(op->args[0]) << ")";
Comment on lines +1704 to +1735
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for handling the different fast math intrinsics is very repetitive. Each else if block contains almost identical code, creating a new CUDAFastMath object and calling it. This could be refactored to reduce duplication and improve maintainability, for example by using a map from the Op to the math function name string and handling them in a single block or a helper function.

} else {
CodeGenC::VisitExpr_(op, os);
}
Expand Down
Loading
Loading