Skip to content

Commit 7b88a74

Browse files
committed
[CUDA] FP4 cast and reinterpret support
Following up on a previous PR, this PR introduces cast and reinterpret support between `__nv_fp4_e2m1` and other dtypes. This PR also makes sure that the cast and reinterpret implementations support vectorized types.
1 parent e35a424 commit 7b88a74

File tree

7 files changed

+264
-17
lines changed

7 files changed

+264
-17
lines changed

python/tvm/runtime/ndarray.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,9 @@ def copyfrom(self, source_array):
197197
source_array = np.ascontiguousarray(
198198
source_array, dtype="uint16" if dtype == "bfloat16" else dtype
199199
)
200-
if dtype.startswith("e2m1_float4"):
200+
if self.dtype.startswith("e2m1_float4") and self.dtype != "e2m1_float4":
201+
# e2m1_float4 in numpy is not packed.
202+
# So we need to pack the input data when converting to vectorized e2m1_float4 type.
201203
data_bits = source_array.view(dtype="uint8")
202204
if data_bits.size % 2:
203205
data_bits = np.pad(data_bits, (0, 1), mode="constant", constant_values=0)
@@ -271,12 +273,14 @@ def numpy(self):
271273
np_arr = np.empty(shape, dtype=dtype)
272274
assert np_arr.flags["C_CONTIGUOUS"]
273275
data = np_arr.ctypes.data_as(ctypes.c_void_p)
274-
if old_dtype.startswith("e2m1_float4"):
276+
if old_dtype.startswith("e2m1_float4") and old_dtype != "e2m1_float4":
275277
nbytes = ctypes.c_size_t(np_arr.size * np_arr.dtype.itemsize // 2)
276278
else:
277279
nbytes = ctypes.c_size_t(np_arr.size * np_arr.dtype.itemsize)
278280
check_call(_LIB.TVMArrayCopyToBytes(self.handle, data, nbytes))
279-
if old_dtype == "int4" or old_dtype.startswith("e2m1_float4"):
281+
if old_dtype == "int4" or (
282+
old_dtype.startswith("e2m1_float4") and old_dtype != "e2m1_float4"
283+
):
280284
length = np_arr.size
281285
np_arr = np_arr.view("int8")
282286
np_arr_ret = np.empty((length,), dtype="int8")

python/tvm/script/ir_builder/tir/ir.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,6 +1458,7 @@ def func(
14581458
e5m2_float8x64 = func_gen(("E5M2Float8x64"))
14591459

14601460
e2m1_float4 = func_gen(("E2M1Float4"))
1461+
e2m1_float4x2 = func_gen(("E2M1Float4x2"))
14611462
e2m1_float4x4 = func_gen(("E2M1Float4x4"))
14621463
e2m1_float4x8 = func_gen(("E2M1Float4x8"))
14631464
e2m1_float4x16 = func_gen(("E2M1Float4x16"))
@@ -2017,6 +2018,7 @@ def wrapped(*args, **kwargs):
20172018
"float16",
20182019
"float32",
20192020
"float64",
2021+
"e2m1_float4x2",
20202022
"e4m3_float8x4",
20212023
"e5m2_float8x4",
20222024
"e2m1_float4x4",

src/target/source/codegen_c.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,11 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) { // NOLI
789789
}
790790
}
791791

792+
if (value_dtype.is_e2m1_float4() && lanes != 1) {
793+
// An e2m1_float4 element has 4 bits, which is an incomplete byte.
794+
// So we cannot vector load it.
795+
can_vector_load = false;
796+
}
792797
if (can_vector_load) {
793798
std::string ref = GetVecLoad(op->dtype, op->buffer.get(), base.Eval());
794799
HandleVolatileLoads(ref, op, os);
@@ -839,7 +844,8 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
839844
} else {
840845
arith::PVar<PrimExpr> base;
841846

842-
if (arith::ramp(base, 1, value_dtype.lanes()).Match(index_expr)) {
847+
if (arith::ramp(base, 1, value_dtype.lanes()).Match(index_expr) &&
848+
!value_dtype.is_e2m1_float4()) {
843849
std::string value = this->PrintExpr(op->value);
844850
this->PrintVecStore(op->buffer.get(), value_dtype, base.Eval(), value);
845851
} else {

src/target/source/codegen_cuda.cc

Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ std::string GetFP4Type(DataType type) {
8282
} else if (lanes == 4) {
8383
vec = "x4";
8484
} else {
85-
LOG(FATAL) << "Only support scalar and vector types of width (2, 4, 8) for FP8";
85+
LOG(FATAL) << "Only support scalar and vector types of width (2, 4) for FP4";
8686
}
8787
stream << "__nv_fp4";
8888
std::string suffix;
@@ -196,7 +196,7 @@ std::string CodeGenCUDA::Finish() {
196196
decl_stream << "#include <cuda_fp4.h>\n";
197197
decl_stream << "#endif\n\n";
198198
}
199-
declare_vector_type_extensions(decl_stream, enable_fp16_, enable_fp8_);
199+
declare_vector_type_extensions(decl_stream, enable_fp16_, enable_fp8_, enable_fp4_);
200200

201201
if (enable_warp_shuffle_) {
202202
decl_stream << _cuda_warp_intrinsic_util;
@@ -597,6 +597,9 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
597597
}
598598
ICHECK(!type_name.empty());
599599
os << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
600+
} else if (t.is_e2m1_float4()) {
601+
os << "([](__nv_fp4_storage_t v) { __nv_fp4_e2m1 t; t.__x = v; return t; })((" << vec
602+
<< ".__x >> " << i * 4 << ") & 0xF)";
600603
} else {
601604
os << vec << "." << access[i];
602605
}
@@ -1036,8 +1039,8 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
10361039
var_idmap_[inverse_index_map->initial_indices[1].get()] = "local_id";
10371040

10381041
os << "for (int local_id = 0; local_id < 8; ++local_id) {\n";
1039-
os << dst << "[" + this->PrintExpr(dst_ind) + "]"
1040-
<< " = " << src << "[" << src_offset << " + local_id];\n";
1042+
os << dst << "[" + this->PrintExpr(dst_ind) + "] = " << src << "[" << src_offset
1043+
<< " + local_id];\n";
10411044
os << "}\n";
10421045

10431046
} else if (op->op.same_as(builtin::mma_fill())) {
@@ -1155,6 +1158,82 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
11551158
stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr << ")), \"r\"((int)"
11561159
<< guard << ")\n";
11571160
stream << ");\n";
1161+
} else if (op->op.same_as(builtin::reinterpret())) {
1162+
DataType tgt_dtype = op->dtype;
1163+
DataType src_dtype = op->args[0]->dtype;
1164+
PrimExpr value = op->args[0];
1165+
1166+
// Handle e2m1_float4 reinterpret
1167+
if (!src_dtype.is_e2m1_float4() && !tgt_dtype.is_e2m1_float4()) {
1168+
return CodeGenC::VisitExpr_(op, os);
1169+
}
1170+
if (src_dtype == tgt_dtype ||
1171+
tgt_dtype.lanes() * tgt_dtype.bits() == src_dtype.lanes() * src_dtype.bits()) {
1172+
return CodeGenC::VisitExpr_(op, os);
1173+
}
1174+
CHECK_EQ(tgt_dtype.lanes(), src_dtype.lanes())
1175+
<< "E2M1 float4 reinterpret expects source and target to have the same number of lanes. "
1176+
<< "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype;
1177+
CHECK_EQ(tgt_dtype.bytes(), src_dtype.bytes())
1178+
<< "E2M1 float4 reinterpret expects source and target to have the same number of bytes. "
1179+
<< "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype;
1180+
1181+
int lanes = tgt_dtype.lanes();
1182+
1183+
int ssa_scope = BeginScope();
1184+
if (lanes == 1) {
1185+
// The case of lanes == 1 is the same as the normal reinterpret,
1186+
// except that we allow the src and dst dtype to have different number of bits.
1187+
std::string rhs = SSAGetID(PrintExpr(value), src_dtype);
1188+
os << "(*(";
1189+
this->PrintType(tgt_dtype, os);
1190+
os << " *)(&(" << rhs << ")))";
1191+
} else if (lanes == 2) {
1192+
if (tgt_dtype.is_e2m1_float4()) {
1193+
// We view the source as an uint16, and then extract bits of two fp4 numbers,
1194+
// and finally reinterpret the result as fp4x2.
1195+
value = tir::Call(DataType::UInt(16), tir::builtin::reinterpret(), {value});
1196+
tir::Var temp_var("temp_var", DataType::UInt(16));
1197+
value = tir::Let(
1198+
temp_var, value,
1199+
tir::Cast(DataType::UInt(8), (temp_var & IntImm(DataType::UInt(16), 0xF)) |
1200+
((temp_var >> 4) & IntImm(DataType::UInt(16), 0xF0))));
1201+
} else {
1202+
value = tir::Cast(DataType::UInt(16),
1203+
tir::Call(DataType::UInt(8), tir::builtin::reinterpret(), {value}));
1204+
tir::Var temp_var("temp_var", DataType::UInt(16));
1205+
value = tir::Let(temp_var, value,
1206+
(temp_var & IntImm(DataType::UInt(16), 0xF)) |
1207+
((temp_var & IntImm(DataType::UInt(16), 0xF0)) << 4));
1208+
}
1209+
os << PrintExpr(tir::Call(tgt_dtype, tir::builtin::reinterpret(), {value}));
1210+
} else if (lanes == 4) {
1211+
if (tgt_dtype.is_e2m1_float4()) {
1212+
// We view the source as an uint32, and then extract bits of four fp4 numbers,
1213+
// and finally reinterpret the result as fp4x4.
1214+
value = tir::Call(DataType::UInt(32), tir::builtin::reinterpret(), {value});
1215+
tir::Var temp_var("temp_var", DataType::UInt(32));
1216+
value = tir::Let(temp_var, value,
1217+
tir::Cast(DataType::UInt(16),
1218+
(temp_var & IntImm(DataType::UInt(32), 0xF)) |
1219+
((temp_var >> 4) & IntImm(DataType::UInt(32), 0xF0)) |
1220+
((temp_var >> 8) & IntImm(DataType::UInt(32), 0xF00)) |
1221+
((temp_var >> 12) & IntImm(DataType::UInt(32), 0xF000))));
1222+
} else {
1223+
value = tir::Cast(DataType::UInt(32),
1224+
tir::Call(DataType::UInt(16), tir::builtin::reinterpret(), {value}));
1225+
tir::Var temp_var("temp_var", DataType::UInt(32));
1226+
value = tir::Let(temp_var, value,
1227+
(temp_var & IntImm(DataType::UInt(32), 0xF)) |
1228+
((temp_var & IntImm(DataType::UInt(32), 0xF0)) << 4) |
1229+
((temp_var & IntImm(DataType::UInt(32), 0xF00)) << 8) |
1230+
((temp_var & IntImm(DataType::UInt(32), 0xF000)) << 12));
1231+
}
1232+
os << PrintExpr(tir::Call(tgt_dtype, tir::builtin::reinterpret(), {value}));
1233+
} else {
1234+
LOG(FATAL) << "Invalid number of lanes for e2m1_float4 reinterpret: " << lanes;
1235+
}
1236+
EndScope(ssa_scope);
11581237
} else {
11591238
CodeGenC::VisitExpr_(op, os);
11601239
}

src/target/source/literal/cuda_half_t.h

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,9 @@ static constexpr const char* _cuda_warp_intrinsic_util = R"(
385385
386386
)";
387387

388-
void declare_vector_type_extensions(std::ostringstream& stream, bool enable_fp16, bool enable_fp8) {
389-
if (enable_fp16 || enable_fp8) {
388+
void declare_vector_type_extensions(std::ostringstream& stream, bool enable_fp16, bool enable_fp8,
389+
bool enable_fp4) {
390+
if (enable_fp16 || enable_fp8 || enable_fp4) {
390391
stream << R"(
391392
struct __align__(8) half4 {
392393
__half x, y, z, w;
@@ -455,13 +456,47 @@ struct __align__(8) half4 {
455456
result.__x = (a) | (b << 8) | (c << 16) | (d << 24);
456457
return result;
457458
}
459+
)";
460+
}
461+
if (enable_fp4) {
462+
stream << R"(
463+
__host__ __device__ explicit half4(const __nv_fp4x4_e2m1& fp4x4) {
464+
__nv_fp4x2_storage_t lo_part, hi_part;
465+
lo_part = static_cast<__nv_fp4x2_storage_t>(fp4x4.__x & 0xFF);
466+
hi_part = static_cast<__nv_fp4x2_storage_t>((fp4x4.__x >> 8) & 0xFF);
467+
__half2 lo_half2 = __half2(__nv_cvt_fp4x2_to_halfraw2(lo_part, __NV_E2M1));
468+
__half2 hi_half2 = __half2(__nv_cvt_fp4x2_to_halfraw2(hi_part, __NV_E2M1));
469+
x = reinterpret_cast<__half*>(&lo_half2)[0];
470+
y = reinterpret_cast<__half*>(&lo_half2)[1];
471+
z = reinterpret_cast<__half*>(&hi_half2)[0];
472+
w = reinterpret_cast<__half*>(&hi_half2)[1];
473+
}
474+
__host__ __device__ explicit operator __nv_fp4x4_e2m1() const {
475+
__half2 lo_half2 = *reinterpret_cast<const __half2*>(&x);
476+
__half2 hi_half2 = *reinterpret_cast<const __half2*>(&z);
477+
return __nv_fp4x4_e2m1(lo_half2, hi_half2);
478+
}
458479
)";
459480
}
460481
stream << R"(
461482
};
462483
__host__ __device__ half4 make_half4(__half x, __half y, __half z, __half w) {
463484
return half4(x, y, z, w);
464485
}
486+
)";
487+
}
488+
if (enable_fp4) {
489+
stream << R"(
490+
__device__ __nv_fp4x2_e2m1 make___nv_fp4x2_e2m1(__nv_fp4_e2m1 x, __nv_fp4_e2m1 y) {
491+
__nv_fp4x2_e2m1 result;
492+
result.__x = (x.__x) | (y.__x << 4);
493+
return result;
494+
}
495+
__device__ __nv_fp4x4_e2m1 make___nv_fp4x4_e2m1(__nv_fp4_e2m1 a, __nv_fp4_e2m1 b, __nv_fp4_e2m1 c, __nv_fp4_e2m1 d) {
496+
__nv_fp4x4_e2m1 result;
497+
result.__x = (static_cast<__nv_fp4x4_storage_t>(a.__x)) | (static_cast<__nv_fp4x4_storage_t>(b.__x) << 4) | (static_cast<__nv_fp4x4_storage_t>(c.__x) << 8) | (static_cast<__nv_fp4x4_storage_t>(d.__x) << 12);
498+
return result;
499+
}
465500
)";
466501
}
467502
}

src/tir/op/op.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,8 +425,10 @@ PrimExpr cast(const DataType& t, PrimExpr value, Span span) {
425425
PrimExpr reinterpret(const DataType& t, PrimExpr value, Span span) {
426426
if (value.dtype() == t) return value;
427427
if (!t.is_scalable_vector() && !value.dtype().is_scalable_vector()) {
428-
ICHECK(value.dtype().bits() * value.dtype().lanes() == t.bits() * t.lanes())
429-
<< "Bitcast requires size match " << t << " vs " << value.dtype();
428+
ICHECK(value.dtype().bits() * value.dtype().lanes() == t.bits() * t.lanes() ||
429+
((value.dtype().is_e2m1_float4() || t.is_e2m1_float4()) &&
430+
value.dtype().bytes() * value.dtype().lanes() == 1 && t.bytes() * t.lanes() == 1))
431+
<< "Reinterpret requires size match " << t << " vs " << value.dtype();
430432
}
431433
return tir::Call(t, tir::builtin::reinterpret(), {value}, span);
432434
}

0 commit comments

Comments
 (0)