From b15b7d2526512ebf135589c74a1ff05e3b638d5a Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Sun, 4 Jan 2026 11:51:11 +0800 Subject: [PATCH 1/7] Fix some bugs on ci and open rocm ci test --- .github/workflows/ci.yml | 4 +- src/op/gemm.cc | 2 + src/op/logical.cc | 6 +- src/target/codegen_hip.cc | 12 + src/tl_templates/cuda/reduce.h | 8 +- src/tl_templates/hip/atomic.h | 104 +++++++ src/tl_templates/hip/common.h | 98 ++++++- src/tl_templates/hip/debug.h | 274 ++++++------------ src/tl_templates/hip/hip_fp8.h | 1 + src/tl_templates/hip/reduce.h | 180 +++++++++++- .../python/autotune/test_tilelang_autotune.py | 3 +- .../cache/test_tilelang_kernel_cache.py | 3 + ..._tilelang_carver_cuda_driver_properties.py | 7 + .../test_tilelang_carver_recommend_hints.py | 1 + .../test_storage_rewrite_detect_inplace.py | 8 +- testing/python/debug/test_device_assert.py | 4 +- .../python/debug/test_tilelang_debug_print.py | 28 +- .../python/issue/test_tilelang_issue_1001.py | 1 + .../python/issue/test_tilelang_issue_1008.py | 2 + .../python/issue/test_tilelang_issue_830.py | 1 - .../python/issue/test_tilelang_issue_96.py | 2 +- .../python/jit/test_tilelang_jit_callback.py | 11 +- .../python/jit/test_tilelang_jit_cutedsl.py | 4 + testing/python/jit/test_tilelang_jit_gemm.py | 4 +- .../jit/test_tilelang_jit_gemm_cython.py | 14 +- testing/python/jit/test_tilelang_jit_nvrtc.py | 6 + .../jit/test_tilelang_jit_parcompile.py | 4 +- .../python/jit/test_tilelang_jit_tvm_ffi.py | 13 +- .../kernel/test_tilelang_kernel_gemm.py | 8 + .../kernel/test_tilelang_kernel_gemm_simt.py | 8 +- .../test_tilelang_kernel_int4_gemm_mma.py | 2 + .../language/test_tilelang_language_alias.py | 2 +- .../language/test_tilelang_language_alloc.py | 8 +- .../language/test_tilelang_language_annot.py | 15 + ...t_tilelang_language_annotate_safe_value.py | 1 + .../test_tilelang_language_atomic_add.py | 24 +- .../language/test_tilelang_language_clear.py | 2 +- ...test_tilelang_language_composable_index.py | 1 - .../language/test_tilelang_language_copy.py | 3 - .../test_tilelang_language_frontend_v2.py | 2 + .../test_tilelang_language_infinity.py | 1 + .../language/test_tilelang_language_let.py | 1 + .../test_tilelang_language_mask_op.py | 8 +- .../language/test_tilelang_language_ptr.py | 2 +- .../language/test_tilelang_language_unroll.py | 6 +- .../test_tilelang_language_var_init.py | 2 + .../test_tilelang_language_vectorize.py | 1 + .../test_tilelang_language_vectorized_cast.py | 1 + .../test_tilelang_tilelibrary_gemm.py | 155 ++++++++-- .../test_tilelang_tilelibrary_gemm_sp_v2.py | 4 + tilelang/engine/lower.py | 32 +- tilelang/intrinsics/mfma_layout.py | 5 +- tilelang/intrinsics/mfma_macro_generator.py | 4 +- tilelang/jit/adapter/wrapper.py | 12 +- 54 files changed, 784 insertions(+), 331 deletions(-) create mode 100644 src/tl_templates/hip/atomic.h diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b12f0592d..96a28433a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -373,6 +373,7 @@ jobs: ./python # AMD ROCm tests + # runtime and transform tests needs to repair, then rm it from ignore list - name: Run ROCm tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) id: rocm-tests if: contains(matrix.runner.toolkit, 'ROCm') @@ -383,7 +384,8 @@ jobs: pytest --verbose --color=yes --durations=0 --showlocals --cache-clear ) "${PYTEST[@]}" --maxfail=3 --numprocesses=4 \ - ./python/amd + --ignore=./python/runtime --ignore=./python/transform \ + ./python # Apple Metal tests - name: Run Metal 
tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) diff --git a/src/op/gemm.cc b/src/op/gemm.cc index 7ad8b8c1e..94af2cbfe 100644 --- a/src/op/gemm.cc +++ b/src/op/gemm.cc @@ -153,6 +153,8 @@ std::pair GemmWarpPolicyNode::computeWarpPartition( int kNPerWarp = 8; // Columns processed by a single warp if (TargetIsVolta(target)) { kNPerWarp = 16; + } else if (TargetIsCDNA(target)) { + kNPerWarp = 16; } ICHECK(M % kMPerWarp == 0) << "M must be divisible by " << kMPerWarp << ", but got " << M; diff --git a/src/op/logical.cc b/src/op/logical.cc index 0de6658bd..38fe38cd1 100644 --- a/src/op/logical.cc +++ b/src/op/logical.cc @@ -42,14 +42,16 @@ TVM_REGISTER_OP("tl.any_of") .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)) .set_attr("TScriptPrinterName", "any_of") - .set_attr("cuda.FLowerIntrinsic", any_of_op); + .set_attr("cuda.FLowerIntrinsic", any_of_op) + .set_attr("hip.FLowerIntrinsic", any_of_op); TVM_REGISTER_OP("tl.all_of") .set_num_inputs(1) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)) .set_attr("TScriptPrinterName", "all_of") - .set_attr("cuda.FLowerIntrinsic", all_of_op); + .set_attr("cuda.FLowerIntrinsic", all_of_op) + .set_attr("hip.FLowerIntrinsic", all_of_op); } // namespace tl } // namespace tvm diff --git a/src/target/codegen_hip.cc b/src/target/codegen_hip.cc index 8a18c3fc9..aaac71dcf 100644 --- a/src/target/codegen_hip.cc +++ b/src/target/codegen_hip.cc @@ -942,6 +942,8 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) { {"float32x4", "float32x4"}, {"float8_e4m3fnuzx4", "fp8_e4_4_t"}, {"float8_e4m3fnuzx8", "long"}, + {"float8_e5m2fnuzx4", "fp8_e5_4_t"}, + {"float8_e5m2fnuzx8", "long"}, {"float32x16", "float32x16"}}; std::string call_mfma_code = R"({ *((({C_dtype}*){c_ref}) + {c_bias}) = {mfma_buildin}(*((({A_dtype}*){a_ref}) + {a_bias}), @@ -980,6 +982,16 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) { // HIP doesn't need explicit register management like CUDA // This is a no-op for HIP return; + } else if (op->op.same_as(tl::warp_reduce_sum())) { + os << "tl::warp_reduce_sum(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_max())) { + os << "tl::warp_reduce_max(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_min())) { + os << "tl::warp_reduce_min(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_bitand())) { + os << "tl::warp_reduce_bitand(" << PrintExpr(op->args[0]) << ")"; + } else if (op->op.same_as(tl::warp_reduce_bitor())) { + os << "tl::warp_reduce_bitor(" << PrintExpr(op->args[0]) << ")"; } else { CodeGenC::VisitExpr_(op, os); } diff --git a/src/tl_templates/cuda/reduce.h b/src/tl_templates/cuda/reduce.h index 458242649..55b8878b7 100644 --- a/src/tl_templates/cuda/reduce.h +++ b/src/tl_templates/cuda/reduce.h @@ -175,15 +175,15 @@ template struct CumSum2D { static_assert(threads == 1024 or threads == 512 or threads == 256 or threads == 128 or threads == 64 or threads == 32); template - static TL_DEVICE T run(const T *__restrict__ src, T *__restrict__ dst, int H, - int W) { + static TL_DEVICE void run(const T *__restrict__ src, T *__restrict__ dst, + int H, int W) { constexpr int TILE_H = threads / SEG; constexpr unsigned MASK = 0xffffffff; const int num_blocks = (H + TILE_H - 1) / TILE_H; const int tid = threadIdx.x; - const int lane = tid % 32; - const int row = tid / 32; + const int lane = tid % SEG; + const int row = tid / SEG; for (int b = 0; b < num_blocks; ++b) 
{ const int gRow = b * TILE_H + row; diff --git a/src/tl_templates/hip/atomic.h b/src/tl_templates/hip/atomic.h new file mode 100644 index 000000000..30931361b --- /dev/null +++ b/src/tl_templates/hip/atomic.h @@ -0,0 +1,104 @@ +#pragma once + +#include + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +template +__forceinline__ __device__ void AtomicAdd(T1 *address, T2 val, + int memory_order = 0) { + atomicAdd(reinterpret_cast(address), static_cast(val)); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +// Overload for when the first argument is a value instead of a pointer +template +__forceinline__ __device__ void AtomicAdd(T1 &address, T2 val, + int memory_order = 0) { + atomicAdd(reinterpret_cast(&address), static_cast(val)); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +template +__forceinline__ __device__ T1 AtomicAddRet(T1 *ref, T2 val, + int memory_order = 0) { + return atomicAdd(ref, static_cast(val)); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +template +__forceinline__ __device__ void AtomicMax(T1 *address, T2 val, + int memory_order = 0) { + atomicMax(reinterpret_cast(address), static_cast(val)); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +// Overload for when the first argument is a value instead of a pointer +template +__forceinline__ __device__ void AtomicMax(T1 &address, T2 val, + int memory_order = 0) { + atomicMax(reinterpret_cast(&address), static_cast(val)); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +template +__forceinline__ __device__ void AtomicMin(T1 *address, T2 val, + int memory_order = 0) { + atomicMin(reinterpret_cast(address), static_cast(val)); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +// Overload for when the first argument is a value instead of a pointer +template +__forceinline__ __device__ void AtomicMin(T1 &address, T2 val, + int memory_order = 0) { + atomicMin(reinterpret_cast(&address), static_cast(val)); +} + +__forceinline__ __device__ void AtomicAddx2(float *ref, float *val, + int memory_order = 0) { + float2 add_val = *reinterpret_cast(val); + atomicAdd(ref + 0, add_val.x); + atomicAdd(ref + 1, add_val.y); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +__forceinline__ __device__ float2 AtomicAddx2Ret(float *ref, float *val, + int memory_order = 0) { + float2 add_val = *reinterpret_cast(val); + float2 ret; + ret.x = atomicAdd(ref + 0, add_val.x); + ret.y = atomicAdd(ref + 1, add_val.y); + return ret; +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. +__forceinline__ __device__ void AtomicAddx4(float *ref, float *val, + int memory_order = 0) { + float4 add_val = *reinterpret_cast(val); + atomicAdd(ref + 0, add_val.x); + atomicAdd(ref + 1, add_val.y); + atomicAdd(ref + 2, add_val.z); + atomicAdd(ref + 3, add_val.w); +} + +// Add an extra unused input to accommodate the additional 'memory_order' +// argument during lowering. 
+__forceinline__ __device__ float4 AtomicAddx4Ret(float *ref, float *val, + int memory_order = 0) { + float4 add_val = *reinterpret_cast(val); + float4 ret; + ret.x = atomicAdd(ref + 0, add_val.x); + ret.y = atomicAdd(ref + 1, add_val.y); + ret.z = atomicAdd(ref + 2, add_val.z); + ret.w = atomicAdd(ref + 3, add_val.w); + return ret; +} diff --git a/src/tl_templates/hip/common.h b/src/tl_templates/hip/common.h index 8be247e77..186e7dfb2 100644 --- a/src/tl_templates/hip/common.h +++ b/src/tl_templates/hip/common.h @@ -1,6 +1,8 @@ #pragma once +#include "atomic.h" #include +#include #include #include #include @@ -105,18 +107,94 @@ TL_DEVICE unsigned __pack_bfloat162(const bfloat16_t x, const bfloat16_t y) { return (v1 << 16) | v0; } -template -TL_DEVICE void AtomicAdd(T1 *address, T2 val) { - atomicAdd(reinterpret_cast(address), static_cast(val)); +namespace tl { + +// Any +template TL_DEVICE bool Any(T *a, int size) { + for (int i = 0; i < size; i++) { + if (a[i]) { + return true; + } + } + return false; +} + +// All +template TL_DEVICE bool All(T *a, int size) { + for (int i = 0; i < size; i++) { + if (!a[i]) { + return false; + } + } + return true; +} + +// TODO(gong): support shfl_sync(rocm 7.1.1 provide shfl_sync) +// shfl_sync func +template TL_DEVICE T shfl_xor(T val, int delta) { + return __shfl_xor(val, delta); +} + +template TL_DEVICE T shfl_down(T val, int delta) { + return __shfl_down(val, delta); +} + +template TL_DEVICE T shfl_up(T val, int delta) { + return __shfl_up(val, delta); +} + +template TL_DEVICE T shfl(T val, int srcLane) { + return __shfl(val, srcLane); +} + +// specialize half_t +template <> TL_DEVICE half_t shfl_xor(half_t val, int delta) { + float f = static_cast(val); + float r = __shfl_xor(f, delta); + return half_t(r); +} + +template <> TL_DEVICE half_t shfl_down(half_t val, int delta) { + float f = static_cast(val); + float r = __shfl_down(f, delta); + return half_t(r); +} + +template <> TL_DEVICE half_t shfl_up(half_t val, int delta) { + float f = static_cast(val); + float r = __shfl_up(f, delta); + return half_t(r); +} + +template <> TL_DEVICE half_t shfl(half_t val, int srcLane) { + float f = static_cast(val); + float r = __shfl(f, srcLane); + return half_t(r); +} + +// specialize bfloat16_t +template <> TL_DEVICE bfloat16_t shfl_xor(bfloat16_t val, int laneMask) { + float f = static_cast(val); + float r = __shfl_xor(f, laneMask); + return bfloat16_t(r); } -// Overload for when the first argument is a value instead of a pointer -template -TL_DEVICE void AtomicAdd(T1 address, T2 val) { - atomicAdd(reinterpret_cast(&address), static_cast(val)); +template <> TL_DEVICE bfloat16_t shfl_down(bfloat16_t val, int delta) { + float f = static_cast(val); + float r = __shfl_down(f, delta); + return bfloat16_t(r); } -template -TL_DEVICE T1 AtomicAddRet(T1 *address, T2 val) { - return atomicAdd(reinterpret_cast(address), static_cast(val)); +template <> TL_DEVICE bfloat16_t shfl_up(bfloat16_t val, int delta) { + float f = static_cast(val); + float r = __shfl_up(f, delta); + return bfloat16_t(r); } + +template <> TL_DEVICE bfloat16_t shfl(bfloat16_t val, int srcLane) { + float f = static_cast(val); + float r = __shfl(f, srcLane); + return bfloat16_t(r); +} + +} // namespace tl diff --git a/src/tl_templates/hip/debug.h b/src/tl_templates/hip/debug.h index 7b19d3e94..7eb3736c2 100644 --- a/src/tl_templates/hip/debug.h +++ b/src/tl_templates/hip/debug.h @@ -1,191 +1,101 @@ #pragma once #include -// Base template declaration -template __device__ void debug_print_var(const char 
*msg, T var); - -// Specialization for signed char type -template <> -__device__ void debug_print_var(const char *msg, signed char var) { - const char *safe_msg = msg; - int value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=signed " - "char value=%d\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, value); -} - -// Specialization for unsigned char type -template <> -__device__ void debug_print_var(const char *msg, - unsigned char var) { - const char *safe_msg = msg; - unsigned int value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " - "dtype=unsigned char value=%u\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, value); -} - -// Specialization for int type -template <> __device__ void debug_print_var(const char *msg, int var) { - const char *safe_msg = msg; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=int " - "value=%d\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, var); -} - -// Specialization for unsigned int type -template <> -__device__ void debug_print_var(const char *msg, - unsigned int var) { - const char *safe_msg = msg; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " - "dtype=unsigned int value=%u\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, var); -} - -// Specialization for float type -template <> __device__ void debug_print_var(const char *msg, float var) { - const char *safe_msg = msg; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=float " - "value=%f\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, var); -} - -// Specialization for double type -template <> -__device__ void debug_print_var(const char *msg, double var) { - const char *safe_msg = msg; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=double " - "value=%lf\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, var); -} - -// Specialization for bool type -template <> __device__ void debug_print_var(const char *msg, bool var) { - const char *safe_msg = msg; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=bool " - "value=%s\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, - var ? 
"true" : "false"); -} - -// Specialization for short type -template <> __device__ void debug_print_var(const char *msg, short var) { - const char *safe_msg = msg; - int value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=short " - "value=%d\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, value); -} - -// Specialization for unsigned short type -template <> -__device__ void debug_print_var(const char *msg, - unsigned short var) { - const char *safe_msg = msg; - unsigned int value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " - "dtype=unsigned short value=%u\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, value); +#include "hip_fp8.h" + +template struct PrintTraits { + static __device__ void print_var(const char *msg, T val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " + "dtype=unknown value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, (const void *)&val); + } + + static __device__ void print_buffer(const char *msg, const char *buf_name, + int index, T val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " + "index=%d, dtype=unknown value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, buf_name, index, (const void *)&val); + } +}; + +#define DEFINE_PRINT_TRAIT(TYPE, NAME, FORMAT, CAST_TYPE) \ + template <> struct PrintTraits { \ + static __device__ void print_var(const char *msg, TYPE val) { \ + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " \ + "dtype=" NAME " value=" FORMAT "\n", \ + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ + threadIdx.y, threadIdx.z, (CAST_TYPE)val); \ + } \ + static __device__ void print_buffer(const char *msg, const char *buf_name, \ + int index, TYPE val) { \ + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " \ + "buffer=%s, index=%d, dtype=" NAME " value=" FORMAT "\n", \ + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, \ + threadIdx.y, threadIdx.z, buf_name, index, (CAST_TYPE)val); \ + } \ + } + +DEFINE_PRINT_TRAIT(char, "char", "%d", int); +DEFINE_PRINT_TRAIT(signed char, "signed char", "%d", int); +DEFINE_PRINT_TRAIT(unsigned char, "unsigned char", "%u", unsigned int); +DEFINE_PRINT_TRAIT(short, "short", "%d", int); +DEFINE_PRINT_TRAIT(unsigned short, "unsigned short", "%u", unsigned int); +DEFINE_PRINT_TRAIT(int, "int", "%d", int); +DEFINE_PRINT_TRAIT(unsigned int, "uint", "%u", unsigned int); +DEFINE_PRINT_TRAIT(long, "long", "%ld", long); +DEFINE_PRINT_TRAIT(unsigned long, "ulong", "%lu", unsigned long); +DEFINE_PRINT_TRAIT(long long, "long long", "%lld", long long); + +DEFINE_PRINT_TRAIT(float, "float", "%f", float); +DEFINE_PRINT_TRAIT(double, "double", "%lf", double); +DEFINE_PRINT_TRAIT(half_t, "half_t", "%f", float); +DEFINE_PRINT_TRAIT(bfloat16_t, "bfloat16_t", "%f", float); + +DEFINE_PRINT_TRAIT(fp8_e4_t, "fp8_e4_t", "%f", float); +DEFINE_PRINT_TRAIT(fp8_e5_t, "fp8_e5_t", "%f", float); + +// +template <> struct PrintTraits { + static __device__ void print_var(const char *msg, bool val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): dtype=bool " + "value=%s\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, val ? 
"true" : "false"); + } + static __device__ void print_buffer(const char *msg, const char *buf_name, + int index, bool val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " + "index=%d, dtype=bool value=%s\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, buf_name, index, val ? "true" : "false"); + } +}; + +template struct PrintTraits { + static __device__ void print_var(const char *msg, T *val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): " + "dtype=pointer value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, (void *)val); + } + static __device__ void print_buffer(const char *msg, const char *buf_name, + int index, T *val) { + printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " + "index=%d, dtype=pointer value=%p\n", + msg, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, + threadIdx.z, buf_name, index, (void *)val); + } +}; + +template __device__ void debug_print_var(const char *msg, T var) { + PrintTraits::print_var(msg, var); } // Template declaration for device-side debug printing (buffer only) template __device__ void debug_print_buffer_value(const char *msg, const char *buf_name, - int index, T var); - -// Specialization for signed char type -template <> -__device__ void -debug_print_buffer_value(const char *msg, const char *buf_name, - int index, signed char var) { - const char *safe_msg = msg; - const char *safe_buf_name = buf_name; - int value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=signed char value=%d\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name, - index, value); -} - -// Specialization for unsigned char type -template <> -__device__ void -debug_print_buffer_value(const char *msg, const char *buf_name, - int index, unsigned char var) { - const char *safe_msg = msg; - const char *safe_buf_name = buf_name; - unsigned int value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=unsigned char value=%u\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name, - index, value); -} - -// Specialization for integer type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, int index, - int var) { - const char *safe_msg = msg; - const char *safe_buf_name = buf_name; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=int value=%d\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name, - index, var); -} - -// Specialization for float type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, int index, - float var) { - const char *safe_msg = msg; - const char *safe_buf_name = buf_name; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=float value=%f\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name, - index, var); -} - -// Specialization for half_t type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, half_t 
var) { - const char *safe_msg = msg; - const char *safe_buf_name = buf_name; - float value = static_cast(var); - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=half_t value=%f\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name, - index, value); -} - -// Specialization for double type -template <> -__device__ void debug_print_buffer_value(const char *msg, - const char *buf_name, - int index, double var) { - const char *safe_msg = msg; - const char *safe_buf_name = buf_name; - printf("msg='%s' BlockIdx=(%d, %d, %d), ThreadIdx=(%d, %d, %d): buffer=%s, " - "index=%d, dtype=double value=%lf\n", - safe_msg, (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z, - (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z, safe_buf_name, - index, var); + int index, T var) { + PrintTraits::print_buffer(msg, buf_name, index, var); } diff --git a/src/tl_templates/hip/hip_fp8.h b/src/tl_templates/hip/hip_fp8.h index 82fb53031..0503e8f46 100644 --- a/src/tl_templates/hip/hip_fp8.h +++ b/src/tl_templates/hip/hip_fp8.h @@ -1,3 +1,4 @@ +#pragma once #include #define HIP_FP8_ENABLED 1 diff --git a/src/tl_templates/hip/reduce.h b/src/tl_templates/hip/reduce.h index 16c51b648..7185585ee 100644 --- a/src/tl_templates/hip/reduce.h +++ b/src/tl_templates/hip/reduce.h @@ -73,7 +73,7 @@ struct SharedReduceWarp { } for (int offset = kWarpSize / 2; offset > 0; offset >>= 1) { - T other = __shfl_down(partial, offset, kWarpSize); + T other = tl::shfl_down(partial, offset, kWarpSize); partial = Reducer()(partial, other); } @@ -104,7 +104,7 @@ struct AllReduce { __syncthreads(); x = Reducer()(x, red_buf[threadIdx.x ^ offset]); } else { - x = Reducer()(x, __shfl_xor(x, offset)); + x = Reducer()(x, tl::shfl_xor(x, offset)); } if constexpr (offset == scale) { return x; @@ -114,4 +114,180 @@ struct AllReduce { } }; +template struct CumSum1D { + static_assert(threads == 1024 or threads == 512 or threads == 256 or + threads == 128 or threads == 64); + template + static TL_DEVICE void run(const T *__restrict__ src, T *__restrict__ dst, + int N) { + if (N <= 0) + return; + + const int tid = threadIdx.x; + const int lane = tid % SEG; + + if (tid >= SEG) + return; + + T carry = (T)0; + + if (reverse) { + const int num_segments = (N + SEG - 1) / SEG; + for (int seg = num_segments - 1; seg >= 0; --seg) { + const int idx = seg * SEG + lane; + T val = (idx < N) ? src[idx] : (T)0; + +#pragma unroll + for (int off = 1; off < SEG; off <<= 1) { + T n = tl::shfl_down(val, off); + if (lane < SEG - off) + val += n; + } + + val += carry; + + if (idx < N) + dst[idx] = val; + + T segSum = tl::shfl(val, 0); + if (lane == 0) + carry = segSum; + carry = tl::shfl(carry, 0); + } + } else { + const int num_segments = (N + SEG - 1) / SEG; + for (int seg = 0; seg < num_segments; ++seg) { + const int idx = seg * SEG + lane; + T val = (idx < N) ? 
src[idx] : (T)0; + +#pragma unroll + for (int off = 1; off < SEG; off <<= 1) { + T n = tl::shfl_up(val, off); + if (lane >= off) + val += n; + } + + val += carry; + + if (idx < N) + dst[idx] = val; + + T segSum = tl::shfl(val, SEG - 1); + if (lane == SEG - 1) + carry = segSum; + carry = tl::shfl(carry, SEG - 1); + } + } + } +}; + +template struct CumSum2D { + static_assert(threads == 1024 or threads == 512 or threads == 256 or + threads == 128 or threads == 64); + template + static TL_DEVICE void run(const T *__restrict__ src, T *__restrict__ dst, + int H, int W) { + + constexpr int TILE_H = threads / SEG; + const int num_blocks = (H + TILE_H - 1) / TILE_H; + const int tid = threadIdx.x; + const int lane = tid % SEG; + const int row = tid / SEG; + + for (int b = 0; b < num_blocks; ++b) { + const int gRow = b * TILE_H + row; + if (gRow >= H) + return; + + T carry = (T)0; + + if (reverse) { + // Start from the last segment for reverse mode + for (int seg = (W + SEG - 1) / SEG - 1; seg >= 0; --seg) { + const int col = seg * SEG + lane; + + const int real_row = Axis == 1 ? gRow : col; + const int real_col = Axis == 1 ? col : gRow; + + T val = (col < W) ? src[real_row * W + real_col] : (T)0; + +#pragma unroll + for (int off = 1; off < SEG; off <<= 1) { + T n = tl::shfl_down(val, off); + if (lane < SEG - off) + val += n; + } + + val += carry; + + if (real_col < W) + dst[real_row * W + real_col] = val; + + T segSum = tl::shfl(val, 0); + if (lane == 0) + carry = segSum; + carry = tl::shfl(carry, 0); + } + } else { + for (int seg = 0; seg * SEG < W; ++seg) { + const int col = seg * SEG + lane; + + const int real_row = Axis == 1 ? gRow : col; + const int real_col = Axis == 1 ? col : gRow; + + T val = (col < W) ? src[real_row * W + real_col] : (T)0; + +#pragma unroll + for (int off = 1; off < SEG; off <<= 1) { + T n = tl::shfl_up(val, off); + if (lane >= off) + val += n; + } + + val += carry; + + if (real_col < W) + dst[real_row * W + real_col] = val; + + T segSum = tl::shfl(val, SEG - 1); + if (lane == SEG - 1) + carry = segSum; + carry = tl::shfl(carry, SEG - 1); + } + } + } + } +}; + +template +TL_DEVICE T warp_reduce(T value, ReduceOp op) { + value = op(value, __shfl_xor(value, 32)); + value = op(value, __shfl_xor(value, 16)); + value = op(value, __shfl_xor(value, 8)); + value = op(value, __shfl_xor(value, 4)); + value = op(value, __shfl_xor(value, 2)); + value = op(value, __shfl_xor(value, 1)); + return value; +} + +template TL_DEVICE T warp_reduce_sum(T value) { + return warp_reduce(value, SumOp()); +} + +template TL_DEVICE T warp_reduce_max(T value) { + return warp_reduce(value, MaxOp()); +} + +template TL_DEVICE T warp_reduce_min(T value) { + return warp_reduce(value, MinOp()); +} + +template TL_DEVICE T warp_reduce_bitand(T value) { + return warp_reduce(value, BitAndOp()); +} + +template TL_DEVICE T warp_reduce_bitor(T value) { + return warp_reduce(value, BitOrOp()); +} + } // namespace tl diff --git a/testing/python/autotune/test_tilelang_autotune.py b/testing/python/autotune/test_tilelang_autotune.py index 53707ca34..f4b9709a8 100644 --- a/testing/python/autotune/test_tilelang_autotune.py +++ b/testing/python/autotune/test_tilelang_autotune.py @@ -251,7 +251,6 @@ def main( AutoTuner.from_kernel(kernel=kernel, configs=get_configs(M, N, K, with_roller)) .set_compile_args( out_idx=[-1], - target="auto", ) .set_profile_args( ref_prog=ref_program, @@ -260,11 +259,13 @@ def main( return autotuner.run(warmup=3, rep=20) +@tilelang.testing.requires_cuda def test_autotune_get_configs(): 
get_configs(1024, 1024, 1024, with_roller=True) get_configs(1024, 1024, 1024, with_roller=False) +@tilelang.testing.requires_cuda def test_autotune_matmul(): matmul(1024, 1024, 1024, with_roller=True) matmul(1024, 1024, 1024, with_roller=False) diff --git a/testing/python/cache/test_tilelang_kernel_cache.py b/testing/python/cache/test_tilelang_kernel_cache.py index 9f6683a8d..617bf76be 100644 --- a/testing/python/cache/test_tilelang_kernel_cache.py +++ b/testing/python/cache/test_tilelang_kernel_cache.py @@ -118,6 +118,7 @@ def clean_cache_env(tmp_path, request): return cache_dir +@tilelang.testing.requires_cuda @pytest.mark.parametrize("backend", BACKENDS) def test_disk_cache_with_postproc(clean_cache_env, backend): """Test disk cache for multiple backends using postproc callback. @@ -195,6 +196,7 @@ def vector_add( torch.testing.assert_close(c1, c2) +@tilelang.testing.requires_cuda @pytest.mark.parametrize("backend", BACKENDS) def test_cache_miss_detection(clean_cache_env, backend): """Verify cache correctly misses when function changes. @@ -245,6 +247,7 @@ def func2(A: T.Tensor((M, N), T.float32), B: T.Tensor((M, N), T.float32)): assert counter.count == 2, f"Different function should cause cache miss, expected 2 calls, got {counter.count}" +@tilelang.testing.requires_cuda @pytest.mark.parametrize("backend", BACKENDS) def test_cache_isolation_between_tests(clean_cache_env, backend): """Verify cache isolation between tests. diff --git a/testing/python/carver/test_tilelang_carver_cuda_driver_properties.py b/testing/python/carver/test_tilelang_carver_cuda_driver_properties.py index 67d20b897..dd58ca259 100644 --- a/testing/python/carver/test_tilelang_carver_cuda_driver_properties.py +++ b/testing/python/carver/test_tilelang_carver_cuda_driver_properties.py @@ -26,42 +26,49 @@ class _cudaDeviceAttrNames: cudaDevAttrMaxPersistingL2CacheSize: int = 108 +@tilelang.testing.requires_cuda def test_driver_get_device_properties(): prop = get_cuda_device_properties() assert prop is not None, "Failed to get CUDA device properties" assert isinstance(prop, torch.cuda._CudaDeviceProperties), "Returned object is not of type _CudaDeviceProperties" +@tilelang.testing.requires_cuda def test_device_get_device_name(): tl_device_name = get_device_name() th_device_name = torch.cuda.get_device_name() assert tl_device_name == th_device_name, "Device names do not match" +@tilelang.testing.requires_cuda def test_device_get_shared_memory_per_block(): tl_smem = get_shared_memory_per_block() driver_smem = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxSharedMemoryPerBlock) assert tl_smem == driver_smem, "Shared memory per block values do not match" +@tilelang.testing.requires_cuda def test_device_get_persisting_l2_cache_size(): tl_cache_size = get_persisting_l2_cache_max_size() driver_cache_size = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxPersistingL2CacheSize) assert tl_cache_size == driver_cache_size, "Persisting L2 cache size values do not match" +@tilelang.testing.requires_cuda def test_device_get_num_sms(): tl_num_sms = get_num_sms() driver_num_sms = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMultiProcessorCount) assert tl_num_sms == driver_num_sms, "Number of SMs do not match" +@tilelang.testing.requires_cuda def test_device_get_registers_per_block(): tl_regs_per_block = get_registers_per_block() driver_regs_per_block = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxRegistersPerBlock) assert tl_regs_per_block == driver_regs_per_block, "Registers per block values do not 
match" +@tilelang.testing.requires_cuda def test_device_get_max_dynamic_shared_size_bytes(): tl_dynamic_smem = get_max_dynamic_shared_size_bytes() driver_dynamic_smem = get_device_attribute(_cudaDeviceAttrNames.cudaDevAttrMaxSharedMemoryPerMultiprocessor) diff --git a/testing/python/carver/test_tilelang_carver_recommend_hints.py b/testing/python/carver/test_tilelang_carver_recommend_hints.py index 3a060f532..a096ec3b2 100644 --- a/testing/python/carver/test_tilelang_carver_recommend_hints.py +++ b/testing/python/carver/test_tilelang_carver_recommend_hints.py @@ -133,6 +133,7 @@ def run_fmha_recommend_hints( assert len(hints) > 0, "Hints length should be greater than 0" +@tilelang.testing.requires_cuda def test_fmha_recommend_hints(): run_fmha_recommend_hints(4, 32, 512, 512, 128, T.float16, T.float16, T.float16) run_fmha_recommend_hints(4, 32, 512, 512, 128, T.int8, T.int32, T.int32) diff --git a/testing/python/components/test_storage_rewrite_detect_inplace.py b/testing/python/components/test_storage_rewrite_detect_inplace.py index 4c4f4e5f3..3dcd7f57c 100644 --- a/testing/python/components/test_storage_rewrite_detect_inplace.py +++ b/testing/python/components/test_storage_rewrite_detect_inplace.py @@ -1,6 +1,9 @@ import tilelang import tilelang.testing from tilelang import language as T +from tilelang.utils.target import check_hip_availability + +_IS_HIP_AVAILABLE = check_hip_availability() @tilelang.jit @@ -54,8 +57,9 @@ def test_storage_rewrite_detect_inplace_toggle(): script_off = _get_device_kernel_script(detect_inplace=False) script_on = _get_device_kernel_script(detect_inplace=True) - assert script_off.count("read = (read * 2);") == 0 - assert script_on.count("read = (read * 2);") > 0 + pattern = "read[0] = (read[0] * 2);" if _IS_HIP_AVAILABLE else "read = (read * 2);" + assert script_off.count(pattern) == 0 + assert script_on.count(pattern) > 0 if __name__ == "__main__": diff --git a/testing/python/debug/test_device_assert.py b/testing/python/debug/test_device_assert.py index 210b8966d..4ed72903e 100644 --- a/testing/python/debug/test_device_assert.py +++ b/testing/python/debug/test_device_assert.py @@ -13,7 +13,7 @@ def program(): tid = T.get_thread_binding() T.device_assert(tid > 0, "Assertion Trigger !") - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program) profiler = jit_kernel.get_profiler() profiler.run_once() @@ -25,7 +25,7 @@ def program(): tid = T.get_thread_binding() T.device_assert(tid == tid) - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program) profiler = jit_kernel.get_profiler() profiler.run_once() diff --git a/testing/python/debug/test_tilelang_debug_print.py b/testing/python/debug/test_tilelang_debug_print.py index 735eb3e80..23c0f4d92 100644 --- a/testing/python/debug/test_tilelang_debug_print.py +++ b/testing/python/debug/test_tilelang_debug_print.py @@ -1,5 +1,5 @@ # type: ignore - +import pytest import tilelang import tilelang.testing import tilelang.language as T @@ -17,19 +17,11 @@ def program(Q: T.Tensor((M, N), dtype)): profiler.run_once() -def test_debug_print_buffer(): - debug_print_buffer(dtype=T.int8) - debug_print_buffer(dtype=T.int16) - debug_print_buffer(dtype=T.int32) - debug_print_buffer(dtype=T.int64) - debug_print_buffer(dtype=T.uint8) - debug_print_buffer(dtype=T.uint16) - debug_print_buffer(dtype=T.uint32) - debug_print_buffer(dtype=T.uint64) - debug_print_buffer(dtype=T.float16) - debug_print_buffer(dtype=T.float32) - debug_print_buffer(dtype=T.float64) - 
debug_print_buffer(dtype=T.bfloat16) +@pytest.mark.parametrize( + "dtype", [T.int8, T.int16, T.int32, T.int64, T.uint8, T.uint16, T.uint32, T.uint64, T.float16, T.float32, T.float64, T.bfloat16] +) +def test_debug_print_buffer(dtype): + debug_print_buffer(dtype=dtype) @tilelang.testing.requires_cuda @@ -55,7 +47,7 @@ def program(Q: T.Tensor((M, N), dtype)): if bx == 0 and by == 0 and bz == 0: T.print(shared_buf) - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program) profiler = jit_kernel.get_profiler() profiler.run_once() @@ -74,7 +66,7 @@ def program(Q: T.Tensor((M, N), dtype)): if tid == 0: T.print(bx + by + bz) - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program) profiler = jit_kernel.get_profiler() profiler.run_once() @@ -93,7 +85,7 @@ def program(Q: T.Tensor((M, N), dtype)): for i, j in T.Parallel(M, N): T.print(register_buf[i, j]) - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program) profiler = jit_kernel.get_profiler() profiler.run_once() @@ -112,7 +104,7 @@ def program(Q: T.Tensor((M, N), dtype)): if tid == 0: T.print(bx + by + bz, msg="hello world") - jit_kernel = tilelang.compile(program, target="cuda") + jit_kernel = tilelang.compile(program) profiler = jit_kernel.get_profiler() profiler.run_once() diff --git a/testing/python/issue/test_tilelang_issue_1001.py b/testing/python/issue/test_tilelang_issue_1001.py index f2315ef21..d6a9ffe26 100644 --- a/testing/python/issue/test_tilelang_issue_1001.py +++ b/testing/python/issue/test_tilelang_issue_1001.py @@ -23,6 +23,7 @@ def buggy_kernel(x: T.Tensor[(num_tokens, hidden), T.float]): return buggy_kernel +@tilelang.testing.requires_cuda def test_cumsum_view_infer_layout(): hidden = 128 x = torch.randn(1, hidden, device="cuda", dtype=torch.float) diff --git a/testing/python/issue/test_tilelang_issue_1008.py b/testing/python/issue/test_tilelang_issue_1008.py index a35a18449..1b25e203c 100644 --- a/testing/python/issue/test_tilelang_issue_1008.py +++ b/testing/python/issue/test_tilelang_issue_1008.py @@ -39,12 +39,14 @@ def buggy_kernel(x: T.Tensor[(num_tokens,), "int64"]): # noqa: F821 return buggy_kernel +@tilelang.testing.requires_cuda def test_fill_with_static_region_kernel(): kernel = _fill_with_static_region_kernel() x = torch.zeros((256,), dtype=torch.int64, device="cuda") kernel(x) +@tilelang.testing.requires_cuda def test_fill_with_dynamic_region_kernel(): kernel = _fill_with_dynamic_region_kernel() x = torch.zeros((256,), dtype=torch.int64, device="cuda") diff --git a/testing/python/issue/test_tilelang_issue_830.py b/testing/python/issue/test_tilelang_issue_830.py index 1a2a909d2..7edb5dbd1 100644 --- a/testing/python/issue/test_tilelang_issue_830.py +++ b/testing/python/issue/test_tilelang_issue_830.py @@ -41,7 +41,6 @@ def buggy_kernel(x: T.Tensor[(num_tokens,), T.float32]): return buggy_kernel -@tilelang.testing.requires_cuda def test_empty_with_dead_code_kernel(): kernel = _empty_with_dead_code_kernel() x = torch.randn((128,), dtype=torch.float32, device="cuda") diff --git a/testing/python/issue/test_tilelang_issue_96.py b/testing/python/issue/test_tilelang_issue_96.py index 9bf5c69bd..db86e825e 100644 --- a/testing/python/issue/test_tilelang_issue_96.py +++ b/testing/python/issue/test_tilelang_issue_96.py @@ -37,7 +37,7 @@ def main( def run_gemm_pipeline_test(N, block_M=128, block_N=128, block_K=32): func = matmul(N, N, N, block_M, block_N, block_K) - jit_kernel = tilelang.compile(func, 
out_idx=[2], target="cuda") + jit_kernel = tilelang.compile(func, out_idx=[2]) torch.manual_seed(0) a = torch.randn(N, N, device="cuda", dtype=torch.float16) diff --git a/testing/python/jit/test_tilelang_jit_callback.py b/testing/python/jit/test_tilelang_jit_callback.py index 9ad8da47f..752657be0 100644 --- a/testing/python/jit/test_tilelang_jit_callback.py +++ b/testing/python/jit/test_tilelang_jit_callback.py @@ -1,7 +1,7 @@ from tilelang import language as T import tilelang.testing import tilelang -from tilelang.engine.callback import register_cuda_postproc_callback +from tilelang.engine.callback import register_cuda_postproc_callback, register_hip_postproc_callback import torch import pytest @@ -90,6 +90,11 @@ def tilelang_callback_cuda_postproc(code, _): code = f"// {stramp}\n" + code return code + @register_hip_postproc_callback + def tilelang_callback_hip_postproc(code, _): + code = f"// {stramp}\n" + code + return code + tilelang.disable_cache() matmul_kernel = tilelang.compile(program, out_idx=-1) tilelang.enable_cache() @@ -109,7 +114,7 @@ def test_cuda_postproc_callback(): False, T.float16, T.float16, - T.float16, + T.float32, 128, 256, 32, @@ -224,7 +229,7 @@ def test_gemm_jit_kernel(): False, T.float16, T.float16, - T.float16, + T.float32, 128, 256, 32, diff --git a/testing/python/jit/test_tilelang_jit_cutedsl.py b/testing/python/jit/test_tilelang_jit_cutedsl.py index 202bbf117..83457b97f 100644 --- a/testing/python/jit/test_tilelang_jit_cutedsl.py +++ b/testing/python/jit/test_tilelang_jit_cutedsl.py @@ -155,6 +155,7 @@ def ref_program(A, B): tilelang.testing.torch_assert_close(C, ref_C, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) +@tilelang.testing.requires_cuda def test_gemm_jit_kernel(): run_gemm_jit_kernel( 512, @@ -206,6 +207,7 @@ def run_cutedsl_kernel_do_bench( assert tvm_latency is not None +@tilelang.testing.requires_cuda def test_cutedsl_kernel_do_bench(): run_cutedsl_kernel_do_bench(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2) @@ -248,6 +250,7 @@ def run_cutedsl_kernel_multi_stream( matmul_kernel(tensor_a, tensor_b, tensor_c) +@tilelang.testing.requires_cuda def test_cutedsl_kernel_multi_stream(): run_cutedsl_kernel_multi_stream(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2) @@ -297,6 +300,7 @@ def run_cutedsl_dynamic_shape( tilelang.testing.torch_assert_close(tensor_c, tensor_ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) +@tilelang.testing.requires_cuda def test_cutedsl_dynamic_shape(): run_cutedsl_dynamic_shape(T.dynamic("m"), 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2) diff --git a/testing/python/jit/test_tilelang_jit_gemm.py b/testing/python/jit/test_tilelang_jit_gemm.py index 97391f26f..9d65714a9 100644 --- a/testing/python/jit/test_tilelang_jit_gemm.py +++ b/testing/python/jit/test_tilelang_jit_gemm.py @@ -103,7 +103,7 @@ def ref_program(A, B): tilelang.testing.torch_assert_close(C, ref_C, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) -def test_gemm_f16f16f16_nn_kernel_jit(): +def test_gemm_f16f16f32_nn_kernel_jit(): run_gemm_kernel_jit( 512, 1024, @@ -112,7 +112,7 @@ def test_gemm_f16f16f16_nn_kernel_jit(): False, T.float16, T.float16, - T.float16, + T.float32, 128, 128, 32, diff --git a/testing/python/jit/test_tilelang_jit_gemm_cython.py b/testing/python/jit/test_tilelang_jit_gemm_cython.py index 04c71db9d..220a40f0a 100644 --- a/testing/python/jit/test_tilelang_jit_gemm_cython.py +++ b/testing/python/jit/test_tilelang_jit_gemm_cython.py @@ 
-166,7 +166,7 @@ def test_gemm_jit_kernel(): False, T.float16, T.float16, - T.float16, + T.float32, 128, 256, 32, @@ -208,7 +208,7 @@ def run_cython_kernel_do_bench( def test_cython_kernel_do_bench(): - run_cython_kernel_do_bench(512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_cython_kernel_do_bench(512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) def run_cython_kernel_multi_stream( @@ -252,7 +252,7 @@ def run_cython_kernel_multi_stream( def test_cython_kernel_multi_stream(): - run_cython_kernel_multi_stream(512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_cython_kernel_multi_stream(512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) def run_cython_dynamic_shape( @@ -301,11 +301,11 @@ def run_cython_dynamic_shape( def test_cython_dynamic_shape(): - run_cython_dynamic_shape(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_cython_dynamic_shape(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) - run_cython_dynamic_shape(T.dynamic("m"), T.dynamic("n"), 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_cython_dynamic_shape(T.dynamic("m"), T.dynamic("n"), 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) - run_cython_dynamic_shape(T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_cython_dynamic_shape(T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) def run_cython_dynamic_shape_with_out_idx( @@ -354,7 +354,7 @@ def run_cython_dynamic_shape_with_out_idx( def test_cython_dynamic_shape_with_out_idx(): - run_cython_dynamic_shape_with_out_idx(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_cython_dynamic_shape_with_out_idx(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) def matmul_int_variable( diff --git a/testing/python/jit/test_tilelang_jit_nvrtc.py b/testing/python/jit/test_tilelang_jit_nvrtc.py index 6eda88a59..581fbc7e9 100644 --- a/testing/python/jit/test_tilelang_jit_nvrtc.py +++ b/testing/python/jit/test_tilelang_jit_nvrtc.py @@ -156,6 +156,7 @@ def ref_program(A, B): tilelang.testing.torch_assert_close(C, ref_C, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) +@tilelang.testing.requires_cuda def test_gemm_jit_kernel(): run_gemm_jit_kernel( 512, @@ -207,6 +208,7 @@ def run_nvrtc_kernel_do_bench( assert tvm_latency is not None +@tilelang.testing.requires_cuda def test_nvrtc_kernel_do_bench(): run_nvrtc_kernel_do_bench(512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) @@ -249,6 +251,7 @@ def run_nvrtc_kernel_multi_stream( matmul_kernel(tensor_a, tensor_b, tensor_c) +@tilelang.testing.requires_cuda def test_nvrtc_kernel_multi_stream(): run_nvrtc_kernel_multi_stream(512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) @@ -298,6 +301,7 @@ def run_nvrtc_dynamic_shape( tilelang.testing.torch_assert_close(tensor_c, tensor_ref_c, atol=1e-2, rtol=1e-2, max_mismatched_ratio=0.05) +@tilelang.testing.requires_cuda def test_nvrtc_dynamic_shape(): run_nvrtc_dynamic_shape(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) @@ -369,6 +373,7 @@ def ref_program(A, B): tilelang.testing.torch_assert_close(out_c, ref_c, atol=1e-2, 
rtol=1e-2, max_mismatched_ratio=0.05) +@tilelang.testing.requires_cuda def test_nvrtc_im2col_tma_desc(): """Test im2col TMA descriptor with NVRTC backend.""" if not check_hopper(): @@ -382,6 +387,7 @@ def test_nvrtc_im2col_tma_desc(): ) +@tilelang.testing.requires_cuda def test_nvrtc_l2_persistent_map(): """Test L2 persistent cache annotation with elementwise add.""" from tilelang.language import annotate_l2_hit_ratio diff --git a/testing/python/jit/test_tilelang_jit_parcompile.py b/testing/python/jit/test_tilelang_jit_parcompile.py index 56201e1cc..bcc76f3e5 100644 --- a/testing/python/jit/test_tilelang_jit_parcompile.py +++ b/testing/python/jit/test_tilelang_jit_parcompile.py @@ -58,8 +58,8 @@ def main( def test_par_compile(): configs = [ - (1024, 1024, 1024, 128, 128, 32), - (2048, 2048, 2048, 256, 256, 64), + (1024, 1024, 1024, 128, 128, 64), + (2048, 2048, 2048, 256, 256, 32), (4096, 4096, 4096, 64, 64, 128), ] kernels = matmul_kernel_jit.par_compile(configs) diff --git a/testing/python/jit/test_tilelang_jit_tvm_ffi.py b/testing/python/jit/test_tilelang_jit_tvm_ffi.py index e0a3346ef..a53c6f70a 100644 --- a/testing/python/jit/test_tilelang_jit_tvm_ffi.py +++ b/testing/python/jit/test_tilelang_jit_tvm_ffi.py @@ -165,7 +165,7 @@ def test_gemm_jit_kernel(): False, T.float16, T.float16, - T.float16, + T.float32, 128, 256, 32, @@ -208,7 +208,7 @@ def run_tvm_ffi_kernel_do_bench( def test_tvm_ffi_kernel_do_bench(): - run_tvm_ffi_kernel_do_bench(512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_tvm_ffi_kernel_do_bench(512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) def run_tvm_ffi_kernel_multi_stream( @@ -250,7 +250,7 @@ def run_tvm_ffi_kernel_multi_stream( def test_tvm_ffi_kernel_multi_stream(): - run_tvm_ffi_kernel_multi_stream(512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_tvm_ffi_kernel_multi_stream(512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) def run_tvm_ffi_dynamic_shape( @@ -299,12 +299,12 @@ def run_tvm_ffi_dynamic_shape( def test_tvm_ffi_dynamic_shape(): - run_tvm_ffi_dynamic_shape(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_tvm_ffi_dynamic_shape(T.dynamic("m"), 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) - run_tvm_ffi_dynamic_shape(T.dynamic("m"), T.dynamic("n"), 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2) + run_tvm_ffi_dynamic_shape(T.dynamic("m"), T.dynamic("n"), 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2) run_tvm_ffi_dynamic_shape( - T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2 + T.dynamic("m"), T.dynamic("n"), T.dynamic("k"), False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2 ) @@ -384,6 +384,7 @@ def test_tvm_ffi_im2col_tma_desc(): ) +@tilelang.testing.requires_cuda def test_tvm_ffi_l2_persistent_map(): """Test L2 persistent cache annotation with elementwise add.""" from tilelang.language import annotate_l2_hit_ratio diff --git a/testing/python/kernel/test_tilelang_kernel_gemm.py b/testing/python/kernel/test_tilelang_kernel_gemm.py index 6dc95e98a..f6a412f14 100644 --- a/testing/python/kernel/test_tilelang_kernel_gemm.py +++ b/testing/python/kernel/test_tilelang_kernel_gemm.py @@ -103,6 +103,7 @@ def ref_program(A, B): profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) +@tilelang.testing.requires_cuda def 
test_gemm_f16f16f16_nn(): run_gemm( 512, @@ -168,6 +169,7 @@ def test_gemm_f32f32f32_nn(): ) +@tilelang.testing.requires_cuda def test_gemm_f16f16f16_tn(): run_gemm( 512, @@ -185,6 +187,7 @@ def test_gemm_f16f16f16_tn(): ) +@tilelang.testing.requires_cuda def test_gemm_f16f16f16_nt(): run_gemm( 512, @@ -210,6 +213,7 @@ def test_gemm_i8i8i32_tn(): run_gemm(512, 1024, 768, True, False, T.int8, T.int8, T.int32, 128, 128, 64) +@tilelang.testing.requires_cuda def test_gemm_f64f64f64_nt(): run_gemm(512, 512, 512, False, True, T.float64, T.float64, T.float64, 64, 32, 16) @@ -230,6 +234,8 @@ def test_gemm_f32f32f32_nt(): ) +# TODO(Gong): Meets precision issue on ROCm, disable for now +@tilelang.testing.requires_cuda def test_gemm_f32f32f32_tn(): run_gemm( 512, @@ -246,6 +252,7 @@ def test_gemm_f32f32f32_tn(): ) +@tilelang.testing.requires_cuda def test_pad_aligned_f16f16f16_nn(): run_gemm( 512 - 8, @@ -263,6 +270,7 @@ def test_pad_aligned_f16f16f16_nn(): ) +@tilelang.testing.requires_cuda def test_pad_f16f16f16_nn(): run_gemm( 512 - 9, diff --git a/testing/python/kernel/test_tilelang_kernel_gemm_simt.py b/testing/python/kernel/test_tilelang_kernel_gemm_simt.py index 584aa854a..5c52f432d 100644 --- a/testing/python/kernel/test_tilelang_kernel_gemm_simt.py +++ b/testing/python/kernel/test_tilelang_kernel_gemm_simt.py @@ -154,15 +154,19 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): assert latency is not None # Get Reference Result - ref_c = torch.matmul(A.to(torch.float32), B.T.to(torch.float32)).to(getattr(torch, accum_dtype)) + ref_c = torch.matmul(A.to(torch.float32), B.T.to(torch.float32)).to(getattr(torch, out_dtype)) print(C) print(ref_c) torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2) def test_assert_tl_matmul(): - assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float32) assert_tl_matmul_correctness(128, 256, 256, T.float16, T.float32, T.float32) + + +@tilelang.testing.requires_cuda +def test_assert_tl_matmul_int8(): assert_tl_matmul_correctness(128, 256, 256, T.int8, T.int32, T.int32) diff --git a/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py b/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py index 9d60e5229..1870be745 100644 --- a/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py +++ b/testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py @@ -196,6 +196,7 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2) +@tilelang.testing.requires_cuda def test_assert_tl_matmul_correctness(): assert_tl_matmul_correctness(128, 128, 128, T.int8, T.int32, T.int32) assert_tl_matmul_correctness(128, 128, 64, T.int8, T.int32, T.int32) @@ -399,6 +400,7 @@ def assert_tl_matmul_weight_only_transform_correctness(M, N, K, in_dtype, out_dt @tilelang.testing.requires_package("bitblas") @tilelang.testing.requires_llvm +@tilelang.testing.requires_cuda def test_assert_tl_matmul_weight_only_transform(): assert_tl_matmul_weight_only_transform_correctness(128, 128, 128, T.int8, T.int32, T.int32) diff --git a/testing/python/language/test_tilelang_language_alias.py b/testing/python/language/test_tilelang_language_alias.py index 48fe1ac4d..77e1a60d2 100644 --- a/testing/python/language/test_tilelang_language_alias.py +++ b/testing/python/language/test_tilelang_language_alias.py @@ -45,7 +45,7 @@ def main( def run_matmul(M, N, K, block_M, block_N, block_K, 
dtype=T.float16, accum_dtype=T.float32): program = matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) - kernel = tilelang.compile(program, out_idx=[2], target="cuda") + kernel = tilelang.compile(program, out_idx=[2]) kernel.run_once() diff --git a/testing/python/language/test_tilelang_language_alloc.py b/testing/python/language/test_tilelang_language_alloc.py index 883f65c3c..709796932 100644 --- a/testing/python/language/test_tilelang_language_alloc.py +++ b/testing/python/language/test_tilelang_language_alloc.py @@ -33,7 +33,7 @@ def run_alloc_var( kernel = tilelang.compile(program, out_idx=[1]) code = kernel.get_kernel_source() - assert "tmp =" in code + assert "tmp =" in code or "tmp[0] =" in code def test_alloc_var(): @@ -73,7 +73,7 @@ def run_alloc_var_add( kernel = tilelang.compile(program, out_idx=[1]) code = kernel.get_kernel_source() - assert "tmp =" in code + assert "tmp =" in code or "tmp[0] =" in code def test_alloc_var_add(): @@ -115,6 +115,8 @@ def run_alloc_var_with_initializer( assert f"= {init_value};" in code +# TODO(Gong): ROCm is not supported yet, disable for now +@tilelang.testing.requires_cuda def test_alloc_var_with_initializer(): run_alloc_var_with_initializer(256, 64, T.int32, 5) @@ -154,6 +156,8 @@ def run_alloc_multi_vars_with_initializer( assert code.count("= 2;") == 1 +# TODO(Gong): ROCm is not supported yet, disable for now +@tilelang.testing.requires_cuda def test_alloc_multi_vars_with_initializer(): run_alloc_multi_vars_with_initializer(256, 64, T.int32) diff --git a/testing/python/language/test_tilelang_language_annot.py b/testing/python/language/test_tilelang_language_annot.py index 5c9aeeac6..b6996d799 100644 --- a/testing/python/language/test_tilelang_language_annot.py +++ b/testing/python/language/test_tilelang_language_annot.py @@ -4,6 +4,11 @@ import torch +# TODO: HIP uses the cython execution backend as default(while CUDA uses tvm_ffi as default), +# but building with the cython backend fails due to a bug. +# Remove @tilelang.testing.requires_cuda after the bug is fixed. +# See https://github.com/tile-ai/tilelang/issues/1594 for more details. +@tilelang.testing.requires_cuda def test_tensor_annot_mul(): @tilelang.jit def example_tensor_annot(): @@ -26,6 +31,11 @@ def kernel( assert torch.equal(A, expected) +# TODO: HIP uses the cython execution backend as default(while CUDA uses tvm_ffi as default), +# but building with the cython backend fails due to a bug. +# Remove @tilelang.testing.requires_cuda after the bug is fixed. +# See https://github.com/tile-ai/tilelang/issues/1594 for more details. +@tilelang.testing.requires_cuda def test_tensor_annot_add(): @tilelang.jit def example_tensor_annot(): @@ -48,6 +58,11 @@ def kernel( assert torch.equal(A, expected) +# TODO: HIP uses the cython execution backend as default(while CUDA uses tvm_ffi as default), +# but building with the cython backend fails due to a bug. +# Remove @tilelang.testing.requires_cuda after the bug is fixed. +# See https://github.com/tile-ai/tilelang/issues/1594 for more details. 
+@tilelang.testing.requires_cuda def test_tensor_annot_mul_add(): @tilelang.jit def example_tensor_annot(): diff --git a/testing/python/language/test_tilelang_language_annotate_safe_value.py b/testing/python/language/test_tilelang_language_annotate_safe_value.py index 3c8239a15..d4d93232d 100644 --- a/testing/python/language/test_tilelang_language_annotate_safe_value.py +++ b/testing/python/language/test_tilelang_language_annotate_safe_value.py @@ -42,6 +42,7 @@ def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16, torch.testing.assert_close(b, ref_b, rtol=1e-2, atol=1e-2) +@tilelang.testing.requires_cuda def test_tilelang_copy(): run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, pad_value=10) diff --git a/testing/python/language/test_tilelang_language_atomic_add.py b/testing/python/language/test_tilelang_language_atomic_add.py index fa4dff7b3..b3c94a742 100644 --- a/testing/python/language/test_tilelang_language_atomic_add.py +++ b/testing/python/language/test_tilelang_language_atomic_add.py @@ -195,9 +195,9 @@ def ref_program(A, B): @tilelang.jit -def atomic_addx2_program(M, N, block_M, block_N): +def atomic_addx2_program(M, N, block_M, block_N, dtype=T.float16): @T.prim_func - def atomic_addx2(A: T.Tensor((M, N), T.float16), B: T.Tensor((M, N), T.float16)): + def atomic_addx2(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=32) as (bx, by): for i, j in T.Parallel(block_M, block_N // 2): idx_i = bx * block_M + i @@ -207,12 +207,12 @@ def atomic_addx2(A: T.Tensor((M, N), T.float16), B: T.Tensor((M, N), T.float16)) return atomic_addx2 -def run_atomic_addx2(M, N, block_M, block_N): - kernel = atomic_addx2_program(M, N, block_M, block_N) +def run_atomic_addx2(M, N, block_M, block_N, dtype=T.float16): + kernel = atomic_addx2_program(M, N, block_M, block_N, dtype=dtype) import torch - A = torch.randn(M, N, dtype=torch.float16).cuda() - B = torch.zeros(M, N, dtype=torch.float16).cuda() + A = torch.randn(M, N, dtype=torch.float32).cuda().to(getattr(torch, dtype)) + B = torch.zeros(M, N, dtype=torch.float32).cuda().to(getattr(torch, dtype)) ref_B = B.clone() for i in range(M): @@ -235,16 +235,23 @@ def test_atomic_min(): run_atomic_min(4, 64, 64, 16, 16) +@tilelang.testing.requires_cuda def test_atomic_load_store(): run_atomic_load_store(64, 64, 16, 16) +@tilelang.testing.requires_cuda def test_atomic_memory_order(): run_atomic_memory_order(4, 64, 64, 16, 16) -def test_atomic_addx2(): - run_atomic_addx2(32, 64, 8, 16) +@tilelang.testing.requires_cuda +def test_atomic_addx2_half(): + run_atomic_addx2(32, 64, 8, 16, dtype=T.float16) + + +def test_atomic_addx2_float(): + run_atomic_addx2(32, 64, 8, 16, dtype=T.float32) @tilelang.jit @@ -343,6 +350,7 @@ def run_atomic_return_prev(M, N, block_M, block_N, dtype=T.float32): torch.testing.assert_close(B, initial_B + A, atol=1e-3, rtol=1e-3) +@tilelang.testing.requires_cuda def test_atomic_different_memory_orders(): run_atomic_different_memory_orders(32, 32, 8, 8, dtype=T.float32) run_atomic_different_memory_orders(32, 32, 8, 8, dtype=T.float16) diff --git a/testing/python/language/test_tilelang_language_clear.py b/testing/python/language/test_tilelang_language_clear.py index af9d89631..2e4c732fc 100644 --- a/testing/python/language/test_tilelang_language_clear.py +++ b/testing/python/language/test_tilelang_language_clear.py @@ -41,7 +41,7 @@ def main( def run_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): 
program = matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) - kernel = tilelang.compile(program, out_idx=[2], target="cuda", pass_configs={"tl.disable_tma_lower": True}) + kernel = tilelang.compile(program, out_idx=[2], pass_configs={"tl.disable_tma_lower": True}) import torch from tilelang.utils import map_torch_type diff --git a/testing/python/language/test_tilelang_language_composable_index.py b/testing/python/language/test_tilelang_language_composable_index.py index 7893c1f24..09f9ad9c4 100644 --- a/testing/python/language/test_tilelang_language_composable_index.py +++ b/testing/python/language/test_tilelang_language_composable_index.py @@ -30,7 +30,6 @@ def run_tilelang_composable_copy(M=1024, N=1024, block_M=128, block_N=128, dtype kernel = tilelang.compile( program, out_idx=[1], - target="cuda", pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, diff --git a/testing/python/language/test_tilelang_language_copy.py b/testing/python/language/test_tilelang_language_copy.py index 29bb0f951..d9d6659d1 100644 --- a/testing/python/language/test_tilelang_language_copy.py +++ b/testing/python/language/test_tilelang_language_copy.py @@ -29,7 +29,6 @@ def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16) kernel = tilelang.compile( program, out_idx=[1], - target="cuda", pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True}, ) source = kernel.get_kernel_source() @@ -66,7 +65,6 @@ def run_tilelang_copy_with_stride(M=1024, N=1024, NN=2048, block_M=128, block_N= kernel = tilelang.compile( program, out_idx=[1], - target="cuda", pass_configs={ tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, @@ -131,7 +129,6 @@ def run_tilelang_copy_buffer_load_with_parallel(M=1024, N=1024, block_M=128, blo kernel = tilelang.compile( program, out_idx=[1], - target="cuda", pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True}, ) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) diff --git a/testing/python/language/test_tilelang_language_frontend_v2.py b/testing/python/language/test_tilelang_language_frontend_v2.py index aacbdacee..9d5213e1a 100644 --- a/testing/python/language/test_tilelang_language_frontend_v2.py +++ b/testing/python/language/test_tilelang_language_frontend_v2.py @@ -319,6 +319,8 @@ def swap_idx(A: T.Tensor[(2,), T.float32]): torch.testing.assert_close(data, ref) +# TODO(Gong): ROCm is not supported alloc_var with initializer +@tilelang.testing.requires_cuda def test_while_loop(): @tilelang.jit(out_idx=-1) @T.prim_func diff --git a/testing/python/language/test_tilelang_language_infinity.py b/testing/python/language/test_tilelang_language_infinity.py index 746afc4e0..a33a616b3 100644 --- a/testing/python/language/test_tilelang_language_infinity.py +++ b/testing/python/language/test_tilelang_language_infinity.py @@ -1,5 +1,6 @@ import torch import tilelang +import tilelang.testing import tilelang.language as T diff --git a/testing/python/language/test_tilelang_language_let.py b/testing/python/language/test_tilelang_language_let.py index 6f94ad664..e1f3f394b 100644 --- a/testing/python/language/test_tilelang_language_let.py +++ b/testing/python/language/test_tilelang_language_let.py @@ -3,6 +3,7 @@ from tilelang import language as T +@tilelang.testing.requires_cuda def 
test_let_vectorize_load(): @T.prim_func def main(A_ptr: T.handle): diff --git a/testing/python/language/test_tilelang_language_mask_op.py b/testing/python/language/test_tilelang_language_mask_op.py index 8f8997291..e577210b1 100644 --- a/testing/python/language/test_tilelang_language_mask_op.py +++ b/testing/python/language/test_tilelang_language_mask_op.py @@ -29,7 +29,7 @@ def main( def run_tilelang_copy_mask_parallel(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_parallel(M, N, block_M, block_N, dtype) kernel = tilelang.compile( - program, out_idx=[1], target="cuda", pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} + program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} ) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) @@ -65,7 +65,7 @@ def main( def run_tilelang_copy_mask_copy(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_copy(M, N, block_M, block_N, dtype) kernel = tilelang.compile( - program, out_idx=[1], target="cuda", pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} + program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} ) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) @@ -102,7 +102,7 @@ def main( def run_tilelang_copy_mask_parallel_range(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_parallel_range(M, N, block_M, block_N, dtype) kernel = tilelang.compile( - program, out_idx=[1], target="cuda", pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} + program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} ) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) @@ -138,7 +138,7 @@ def main( def run_tilelang_copy_mask_copy_range(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_copy_range(M, N, block_M, block_N, dtype) kernel = tilelang.compile( - program, out_idx=[1], target="cuda", pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} + program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} ) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) diff --git a/testing/python/language/test_tilelang_language_ptr.py b/testing/python/language/test_tilelang_language_ptr.py index 85458139a..da137e019 100644 --- a/testing/python/language/test_tilelang_language_ptr.py +++ b/testing/python/language/test_tilelang_language_ptr.py @@ -41,7 +41,7 @@ def main( def run_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): program = matmul_test(M, N, K, block_M, block_N, block_K, dtype, accum_dtype) - jit_kernel = tl.compile(program, target="cuda", execution_backend="cython") + jit_kernel = tl.compile(program, execution_backend="cython") def ref_program(a, b): return (a @ b.T).to(torch.float32) diff --git a/testing/python/language/test_tilelang_language_unroll.py b/testing/python/language/test_tilelang_language_unroll.py index 06367e975..665e57737 100644 --- a/testing/python/language/test_tilelang_language_unroll.py +++ b/testing/python/language/test_tilelang_language_unroll.py @@ -13,10 +13,12 @@ def main(A_ptr: T.handle): for i in T.unroll(0, 16, step=4): A[0, i] = 1.0 - kernel = 
tilelang.compile(main, target="cuda") + kernel = tilelang.compile(main) assert "#pragma unroll" in kernel.get_kernel_source() +# TODO: unroll factor is not supported on hip, skip. +@tilelang.testing.requires_cuda def test_unroll_with_unroll_factor(): @T.prim_func def main(A_ptr: T.handle): @@ -27,7 +29,7 @@ def main(A_ptr: T.handle): for i in T.unroll(0, 16, unroll_factor=4): A[0, i] = 1.0 - kernel = tilelang.compile(main, target="cuda") + kernel = tilelang.compile(main) assert "#pragma unroll 4" in kernel.get_kernel_source() diff --git a/testing/python/language/test_tilelang_language_var_init.py b/testing/python/language/test_tilelang_language_var_init.py index 36d9bf014..35e8a074d 100644 --- a/testing/python/language/test_tilelang_language_var_init.py +++ b/testing/python/language/test_tilelang_language_var_init.py @@ -3,6 +3,8 @@ import tilelang.testing +# TODO: var init is not supported on hip. +@tilelang.testing.requires_cuda def test_var_assign() -> None: @tilelang.jit(out_idx=-1) def jit_kernel(): diff --git a/testing/python/language/test_tilelang_language_vectorize.py b/testing/python/language/test_tilelang_language_vectorize.py index 7462aa81b..577527ae6 100644 --- a/testing/python/language/test_tilelang_language_vectorize.py +++ b/testing/python/language/test_tilelang_language_vectorize.py @@ -124,6 +124,7 @@ def main(A: T.Tensor[(64,), dtype]): return main +@tilelang.testing.requires_cuda @pytest.mark.parametrize( "dtype", [ diff --git a/testing/python/language/test_tilelang_language_vectorized_cast.py b/testing/python/language/test_tilelang_language_vectorized_cast.py index 33d40e679..e4684f70c 100644 --- a/testing/python/language/test_tilelang_language_vectorized_cast.py +++ b/testing/python/language/test_tilelang_language_vectorized_cast.py @@ -75,6 +75,7 @@ def run_vectorized_cast(src_dtype: T.dtype, dst_dtype: T.dtype, check_str: str, torch.testing.assert_close(A.to(dst_dtype.as_torch()), C) +@tilelang.testing.requires_cuda @pytest.mark.parametrize( "src_dtype, dst_dtype, check_str, lanes", [ diff --git a/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py b/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py index 67123cb8c..8e023d8da 100644 --- a/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py +++ b/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py @@ -111,16 +111,15 @@ def ref_program(A, B): @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ - (512, 1024, 768, False, True, T.float16, T.float16, T.float16, 128, 128, 32, 2, 128), - (512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 128, 32, 2, 128), - (512, 1024, 768, True, False, T.float16, T.float16, T.float16, 128, 128, 32, 2, 128), - (512, 1024, 768, True, True, T.float16, T.float16, T.float16, 128, 128, 32, 2, 128), - (128, 8, 32, False, True, T.float16, T.float16, T.float16, 128, 8, 32, 0, 128), + (512, 1024, 768, False, True, T.float16, T.float16, T.float32, 128, 128, 32, 2, 128), + (512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 128, 32, 2, 128), + (512, 1024, 768, True, False, T.float16, T.float16, T.float32, 128, 128, 32, 2, 128), + (512, 1024, 768, True, True, T.float16, T.float16, T.float32, 128, 128, 32, 2, 128), + (128, 16, 32, False, True, T.float16, T.float16, T.float32, 128, 16, 32, 0, 128), (128, 128, 128, False, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 
128), (128, 128, 128, True, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, True, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), - (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, True, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, True, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), @@ -131,6 +130,31 @@ def test_gemm_ss(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, blo run_gemm_ss(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) +@pytest.mark.skip(reason="Temporarily disabling until GEMM SS issues are resolved") +@tilelang.testing.requires_cuda +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_ss_fp8_cuda(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_ss(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + +@pytest.mark.skip(reason="Temporarily disabling until GEMM SS issues are resolved") +@tilelang.testing.requires_rocm +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + (128, 128, 128, True, True, T.float8_e5m2fnuz, T.float8_e5m2fnuz, T.float32, 128, 128, 32, 2, 128), + (128, 128, 128, True, True, T.float8_e4m3fnuz, T.float8_e4m3fnuz, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_ss_fp8_rocm(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_ss(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + def matmul_rs( M, N, @@ -244,16 +268,15 @@ def ref_program(A, B): @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ - (512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, False, True, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, True, False, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, True, True, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (128, 8, 32, False, True, T.float16, T.float16, T.float16, 128, 8, 32, 0, 128), + (512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, False, True, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, True, False, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, True, True, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (128, 16, 32, False, True, T.float16, T.float16, T.float32, 128, 16, 32, 0, 128), (128, 128, 128, False, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, True, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, True, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), - (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, 
T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, True, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, True, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), @@ -264,6 +287,31 @@ def test_gemm_rs(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, blo run_gemm_rs(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) +@pytest.mark.skip(reason="Temporarily disabling until GEMM RS issues are resolved") +@tilelang.testing.requires_cuda +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_rs_fp8_cuda(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_rs(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + +@pytest.mark.skip(reason="Temporarily disabling until GEMM RS issues are resolved") +@tilelang.testing.requires_rocm +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + (128, 128, 128, True, True, T.float8_e5m2fnuz, T.float8_e5m2fnuz, T.float32, 128, 128, 32, 2, 128), + (128, 128, 128, True, True, T.float8_e4m3fnuz, T.float8_e4m3fnuz, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_rs_fp8_rocm(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_rs(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + def matmul_sr( M, N, @@ -376,26 +424,51 @@ def ref_program(A, B): @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ - (512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, False, True, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, True, False, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, True, True, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (128, 8, 32, False, True, T.float16, T.float16, T.float16, 128, 8, 32, 0, 128), - (128, 128, 32, False, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), - (128, 128, 32, False, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), + (512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, False, True, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, True, False, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, True, True, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (128, 16, 32, False, True, T.float16, T.float16, T.float32, 128, 16, 32, 0, 128), + # TODO: There is precision problem when num_stages=2 on ROCm + # (128, 128, 32, False, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), + # (128, 128, 32, False, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 32, True, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 32, True, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), - (128, 128, 128, True, True, T.float8_e5m2, 
T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, True, T.float, T.float, T.float32, 128, 128, 32, 2, 128), - (128, 128, 128, True, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), - (128, 128, 128, True, True, T.float, T.float, T.float32, 128, 128, 32, 2, 128), + # TODO: There is precision problem needs to repair on ROCm + # (128, 128, 128, True, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), + # (128, 128, 128, True, True, T.float, T.float, T.float32, 128, 128, 32, 2, 128), ], ) def test_gemm_sr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): run_gemm_sr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) +@tilelang.testing.requires_cuda +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_sr_fp8_cuda(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_sr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + +@tilelang.testing.requires_rocm +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + # TODO: There is precision problem needs to repair + # (128, 128, 128, True, True, T.float8_e5m2fnuz, T.float8_e5m2fnuz, T.float32, 128, 128, 32, 2, 128), + (128, 128, 128, True, True, T.float8_e4m3fnuz, T.float8_e4m3fnuz, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_sr_fp8_rocm(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_sr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + def matmul_rr( M, N, @@ -514,18 +587,18 @@ def ref_program(A, B): @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ - (512, 1024, 768, False, False, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, False, True, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, True, False, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), - (512, 1024, 768, True, True, T.float16, T.float16, T.float16, 128, 256, 32, 2, 128), + (512, 1024, 768, False, False, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, False, True, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, True, False, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), + (512, 1024, 768, True, True, T.float16, T.float16, T.float32, 128, 256, 32, 2, 128), (512, 1024, 768, False, True, T.bfloat16, T.bfloat16, T.float, 128, 256, 32, 2, 128), - (128, 8, 128, False, True, T.float16, T.float16, T.float16, 128, 8, 32, 2, 128), - (128, 8, 128, False, True, T.int8, T.int8, T.int32, 128, 8, 32, 2, 128), + # TODO: There is precision problem when num_stages=2 on ROCm + # (128, 16, 128, False, True, T.float16, T.float16, T.float32, 128, 16, 32, 2, 128) + # (128, 16, 128, False, True, T.int8, T.int8, T.int32, 128, 16, 32, 2, 128), (128, 128, 128, False, True, 
T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, True, False, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), (128, 128, 128, True, True, T.int8, T.int8, T.int32, 128, 128, 32, 2, 128), - (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, False, True, T.float, T.float, T.float32, 128, 128, 32, 2, 128), (128, 128, 128, True, False, T.float, T.float, T.float32, 128, 128, 32, 2, 128), @@ -536,5 +609,29 @@ def test_gemm_rr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, blo run_gemm_rr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) +@tilelang.testing.requires_cuda +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + (128, 128, 128, True, True, T.float8_e5m2, T.float8_e5m2, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_rr_fp8_cuda(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_rr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + +@tilelang.testing.requires_rocm +@pytest.mark.parametrize( + "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", + [ + # TODO: There is precision problem needs to repair + # (128, 128, 128, True, True, T.float8_e5m2fnuz, T.float8_e5m2fnuz, T.float32, 128, 128, 32, 2, 128), + (128, 128, 128, True, True, T.float8_e4m3fnuz, T.float8_e4m3fnuz, T.float32, 128, 128, 32, 2, 128), + ], +) +def test_gemm_rr_fp8_rocm(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads): + run_gemm_rr(M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads) + + if __name__ == "__main__": tilelang.testing.main() diff --git a/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp_v2.py b/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp_v2.py index 33395a53d..3f3273b9a 100644 --- a/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp_v2.py +++ b/testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp_v2.py @@ -153,6 +153,7 @@ def generate_dense_input(M, N, K, trans_A, trans_B, in_dtype): return A, B +@tilelang.testing.requires_cuda @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ @@ -303,6 +304,7 @@ def _matmul(A, B): print("pass") +@tilelang.testing.requires_cuda @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ @@ -453,6 +455,7 @@ def _matmul(A, B): print("pass") +@tilelang.testing.requires_cuda @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ @@ -607,6 +610,7 @@ def _matmul(A, B): print("pass") +@tilelang.testing.requires_cuda @pytest.mark.parametrize( "M, N, K, trans_A, trans_B, in_dtype, out_dtype, dtypeAccum, block_M, block_N, block_K, num_stages, num_threads", [ diff --git a/tilelang/engine/lower.py b/tilelang/engine/lower.py index 25edca740..024c7d253 100644 --- 
a/tilelang/engine/lower.py +++ b/tilelang/engine/lower.py @@ -2,8 +2,6 @@ from __future__ import annotations -import os -import os.path as osp from typing import Callable import tilelang.transform from tilelang import tvm as tvm @@ -12,6 +10,7 @@ from tvm.ir import CallingConv from tvm.target import Target from tilelang.contrib import hipcc, nvcc +from tilelang.env import COMPOSABLE_KERNEL_INCLUDE_DIR, CUTLASS_INCLUDE_DIR, TILELANG_TEMPLATE_PATH from tilelang.transform import PassConfigKey from tilelang.engine.param import KernelParam, CompiledArtifact from tilelang.utils.target import determine_target @@ -57,17 +56,6 @@ def get_host_call(is_device_c: bool = False) -> Callable[[tir.PrimFunc], bool]: @tvm_ffi.register_global_func("tilelang_callback_cuda_compile", override=True) def tilelang_callback_cuda_compile(code, target, pass_config=None): - project_root = osp.join(osp.dirname(__file__), "../..") - if "TL_TEMPLATE_PATH" in os.environ: - tl_template_path = os.environ["TL_TEMPLATE_PATH"] - else: - tl_template_path = osp.abspath(osp.join(project_root, "src")) - # TODO(lei): this indeed should be renamed into - # TL_CUTLASS_INCLUDE_PATH in the future - if "TL_CUTLASS_PATH" in os.environ: - cutlass_path = os.environ["TL_CUTLASS_PATH"] - else: - cutlass_path = osp.abspath(osp.join(project_root, "3rdparty/cutlass/include")) target_arch = nvcc.get_target_arch(nvcc.get_target_compute_version(target)) arch = [f"-arch=sm_{target_arch}"] @@ -82,8 +70,8 @@ def tilelang_callback_cuda_compile(code, target, pass_config=None): options = [ "-std=c++17", - "-I" + tl_template_path, - "-I" + cutlass_path, + "-I" + TILELANG_TEMPLATE_PATH, + "-I" + CUTLASS_INCLUDE_DIR, ] # Merge extra device compiler flags from pass config, if provided extra_flags = cfg.get(PassConfigKey.TL_DEVICE_COMPILE_FLAGS, None) @@ -124,23 +112,13 @@ def tilelang_callback_cuda_compile(code, target, pass_config=None): @tvm_ffi.register_global_func("tilelang_callback_hip_compile", override=True) def tilelang_callback_hip_compile(code, target): - project_root = osp.join(osp.dirname(__file__), "../..") - tl_template_path = osp.abspath(osp.join(project_root, "src")) - - # TODO(lei): actually this indeed should be renamed into - # TL_COMPOSABLE_KERNEL_INCLUDE_PATH in the future - if "TL_COMPOSABLE_KERNEL_PATH" in os.environ: - ck_path = os.environ["TL_COMPOSABLE_KERNEL_PATH"] - else: - ck_path = osp.abspath(osp.join(project_root, "3rdparty/composable_kernel/include")) - hsaco = hipcc.compile_hip( code, target_format="hsaco", options=[ "-std=c++17", - "-I" + tl_template_path, - "-I" + ck_path, + "-I" + TILELANG_TEMPLATE_PATH, + "-I" + COMPOSABLE_KERNEL_INCLUDE_DIR, ], verbose=False, ) diff --git a/tilelang/intrinsics/mfma_layout.py b/tilelang/intrinsics/mfma_layout.py index 389596494..d8af97988 100644 --- a/tilelang/intrinsics/mfma_layout.py +++ b/tilelang/intrinsics/mfma_layout.py @@ -1,11 +1,12 @@ from tvm import DataType from tvm.runtime import convert +from tvm.tir import const import tilelang.language as T def shared_16x4_to_local_64x1_layout_A(i, j): thread_id = j * 16 + i - return thread_id, convert(0) + return thread_id, const(0) def thread_id_shared_access_64x1_to_16x4_layout_A(thread_id, local_id): @@ -16,7 +17,7 @@ def thread_id_shared_access_64x1_to_16x4_layout_A(thread_id, local_id): def shared_4x16_to_local_64x1_layout_B(i, j): thread_id = i * 16 + j - return thread_id, convert(0) + return thread_id, const(0) def thread_id_shared_access_64x1_to_4x16_layout_B(thread_id, local_id): diff --git 
a/tilelang/intrinsics/mfma_macro_generator.py b/tilelang/intrinsics/mfma_macro_generator.py index ad2192061..89fd0bdc6 100644 --- a/tilelang/intrinsics/mfma_macro_generator.py +++ b/tilelang/intrinsics/mfma_macro_generator.py @@ -50,6 +50,7 @@ class MatrixCoreIntrinEmitter: "float8_e4m3": "e4m3", "float8_e5m2": "e5m2", "float8_e4m3fnuz": "e4m3fnuz", + "float8_e5m2fnuz": "e5m2fnuz", } # k_pack represents the number of elements in a vectorized instruction @@ -107,7 +108,7 @@ def __init__( def _initialize_k_dim(self, a_dtype=T.float16): if isinstance(a_dtype, str): - if a_dtype in ["float8_e4m3fnuz", T.int8]: + if a_dtype in ["float8_e4m3fnuz", "float8_e5m2fnuz", T.int8]: self.k_dim = 32 return a_dtype = DataType(a_dtype) @@ -141,6 +142,7 @@ def _initialize_mfma_prefix(self, k_dim=16): "int8": "i8", "int32": "i32", "float8_e4m3fnuz": "fp8", + "float8_e5m2fnuz": "fp8", }[in_dtype] if in_dtype_abbrv == "fp8": diff --git a/tilelang/jit/adapter/wrapper.py b/tilelang/jit/adapter/wrapper.py index 1c50efa88..6e984bcd8 100644 --- a/tilelang/jit/adapter/wrapper.py +++ b/tilelang/jit/adapter/wrapper.py @@ -283,7 +283,7 @@ def create_dispatch_func(self, code, function_informations): index = match_declare_kernel(code, function_name + "(") # Analyze the function declaration to prepare for argument extraction - declaration = code[index:].split(";")[0] + declaration = self.get_declaration(code[index:]) # Identify the start of the function body to insert arguments index = code.index("{", index) @@ -347,6 +347,9 @@ def create_dispatch_func(self, code, function_informations): host_func = PREDEF_HOST_FUNC.format(def_args, kernel_launch_code) return host_func + def get_declaration(self, declare_kernel_code: str) -> str: + return declare_kernel_code.split(";")[0] + def generate_l2_persistent_map(self, function_name: str) -> str: if function_name not in self.l2_persistent_map: return "" @@ -620,12 +623,14 @@ class TLHIPSourceWrapper(TLCUDASourceWrapper): "float8_e4m3": "fp8_e4_t", "float8_e4m3fn": "fp8_e4_t", "float8_e5m2": "fp8_e5_t", + "float8_e5m2fnuz": "fp8_e5_t", "float8_e4m3fnuz": "fp8_e4_t", "e4m3fnuz_float8": "fp8_e4_t", "float64": "double", "int64": "int64_t", "int32": "int", "uint32": "unsigned int", + "uint64": "uint64_t", "bool": "int8_t", "int8": "int8_t", "uint8": "uint8_t", @@ -645,6 +650,11 @@ def __init__( ): super().__init__(scheduled_ir_module, source, target, device_mod, host_mod, pass_configs) + def get_declaration(self, declare_kernel_code: str) -> str: + # HIP code dont have function declaration, so we use '{\n' to split + # __global__ void __launch_bounds__(128) kernel_kernel(float* __restrict__ A) {\n + return declare_kernel_code.split("{")[0] + def get_init_func(self): # Initialize an empty string for the CUDA function call call_str = """""" From c3ec62ff9e338a5b1ba1234504e141a5cd594142 Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Sun, 4 Jan 2026 11:55:28 +0800 Subject: [PATCH 2/7] fix pre-commit --- .../language/test_tilelang_language_mask_op.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/testing/python/language/test_tilelang_language_mask_op.py b/testing/python/language/test_tilelang_language_mask_op.py index e577210b1..cd899a606 100644 --- a/testing/python/language/test_tilelang_language_mask_op.py +++ b/testing/python/language/test_tilelang_language_mask_op.py @@ -28,9 +28,7 @@ def main( def run_tilelang_copy_mask_parallel(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_parallel(M, N, block_M, block_N, 
dtype) - kernel = tilelang.compile( - program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} - ) + kernel = tilelang.compile(program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True}) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) torch.testing.assert_close(b, a, rtol=1e-2, atol=1e-2) @@ -64,9 +62,7 @@ def main( def run_tilelang_copy_mask_copy(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_copy(M, N, block_M, block_N, dtype) - kernel = tilelang.compile( - program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} - ) + kernel = tilelang.compile(program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True}) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) torch.testing.assert_close(b, a, rtol=1e-2, atol=1e-2) @@ -101,9 +97,7 @@ def main( def run_tilelang_copy_mask_parallel_range(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_parallel_range(M, N, block_M, block_N, dtype) - kernel = tilelang.compile( - program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} - ) + kernel = tilelang.compile(program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True}) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) torch.testing.assert_close(b, a, rtol=1e-2, atol=1e-2) @@ -137,9 +131,7 @@ def main( def run_tilelang_copy_mask_copy_range(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16): program = tilelang_copy_mask_copy_range(M, N, block_M, block_N, dtype) - kernel = tilelang.compile( - program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} - ) + kernel = tilelang.compile(program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True}) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) torch.testing.assert_close(b, a, rtol=1e-2, atol=1e-2) From 59637e8115a962093f45da0a305565ccc182912d Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Sun, 4 Jan 2026 16:47:30 +0800 Subject: [PATCH 3/7] fix build issue on hip --- .../language/test_tilelang_language_annot.py | 27 +++++++------------ ...t_tilelang_language_annotate_safe_value.py | 4 +-- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/testing/python/language/test_tilelang_language_annot.py b/testing/python/language/test_tilelang_language_annot.py index b6996d799..1647a296c 100644 --- a/testing/python/language/test_tilelang_language_annot.py +++ b/testing/python/language/test_tilelang_language_annot.py @@ -4,13 +4,10 @@ import torch -# TODO: HIP uses the cython execution backend as default(while CUDA uses tvm_ffi as default), -# but building with the cython backend fails due to a bug. -# Remove @tilelang.testing.requires_cuda after the bug is fixed. -# See https://github.com/tile-ai/tilelang/issues/1594 for more details. -@tilelang.testing.requires_cuda def test_tensor_annot_mul(): - @tilelang.jit + # There is a known issue where the cython execution backend fails to build with T.symbolic. + # Forcing the TVM FFI execution backend to avoid the issue on HIP. 
+ @tilelang.jit(execution_backend="tvm_ffi") def example_tensor_annot(): n = T.symbolic("n") @@ -31,13 +28,10 @@ def kernel( assert torch.equal(A, expected) -# TODO: HIP uses the cython execution backend as default(while CUDA uses tvm_ffi as default), -# but building with the cython backend fails due to a bug. -# Remove @tilelang.testing.requires_cuda after the bug is fixed. -# See https://github.com/tile-ai/tilelang/issues/1594 for more details. -@tilelang.testing.requires_cuda def test_tensor_annot_add(): - @tilelang.jit + # There is a known issue where the cython execution backend fails to build with T.symbolic. + # Forcing the TVM FFI execution backend to avoid the issue on HIP. + @tilelang.jit(execution_backend="tvm_ffi") def example_tensor_annot(): n = T.symbolic("n") @@ -58,13 +52,10 @@ def kernel( assert torch.equal(A, expected) -# TODO: HIP uses the cython execution backend as default(while CUDA uses tvm_ffi as default), -# but building with the cython backend fails due to a bug. -# Remove @tilelang.testing.requires_cuda after the bug is fixed. -# See https://github.com/tile-ai/tilelang/issues/1594 for more details. -@tilelang.testing.requires_cuda def test_tensor_annot_mul_add(): - @tilelang.jit + # There is a known issue where the cython execution backend fails to build with T.symbolic. + # Forcing the TVM FFI execution backend to avoid the issue on HIP. + @tilelang.jit(execution_backend="tvm_ffi") def example_tensor_annot(): n = T.symbolic("n") diff --git a/testing/python/language/test_tilelang_language_annotate_safe_value.py b/testing/python/language/test_tilelang_language_annotate_safe_value.py index d4d93232d..6dd13344e 100644 --- a/testing/python/language/test_tilelang_language_annotate_safe_value.py +++ b/testing/python/language/test_tilelang_language_annotate_safe_value.py @@ -28,9 +28,7 @@ def main( def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype=T.float16, pad_value=0): program = tilelang_copy(M, N, block_M, block_N, dtype, pad_value=pad_value) - kernel = tilelang.compile( - program, out_idx=[1], target="cuda", pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True} - ) + kernel = tilelang.compile(program, out_idx=[1], pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True}) a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype)) b = kernel(a) ref_b = torch.zeros_like(a) From c6bfcdd4a43bccae996f04560bed0f200d438fd6 Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Mon, 5 Jan 2026 11:10:18 +0800 Subject: [PATCH 4/7] refactor on hip --- src/target/codegen_hip.cc | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/target/codegen_hip.cc b/src/target/codegen_hip.cc index aaac71dcf..05bbd0acf 100644 --- a/src/target/codegen_hip.cc +++ b/src/target/codegen_hip.cc @@ -767,24 +767,25 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) { } this->stream << ");\n"; }; - if (op->op.same_as(builtin::ptx_cp_async())) { + if (op->op.same_as(builtin::ptx_cp_async()) || op->op.same_as(tl::ptx_cp_async())) { + // args[0] = dst_access_ptr, args[1] = src_access_ptr, args[2] = bytes, + // args[3] = predicate (optional) + ICHECK(op->args.size() == 3 || op->args.size() == 4) + << "ptx_cp_async expects 3 or 4 arguments (dst_access_ptr, " + "src_access_ptr, bytes, [predicate])"; std::string dst = this->PrintExpr(op->args[0]); - std::string dst_offset = this->PrintExpr(op->args[1]); - std::string src = this->PrintExpr(op->args[2]); - 
std::string src_offset = this->PrintExpr(op->args[3]); - std::string size = this->PrintExpr(op->args[4]); - // use size of argument list to indicate whether or not to use predicated - // cp.async - if (op->args.size() == 5) { - this->PrintIndent(); - this->stream << "tl::cp_async_gs<" << size << ">(" << dst << "+" - << dst_offset << ", " << src << "+" << src_offset << ");\n"; + std::string src = this->PrintExpr(op->args[1]); + std::string size = this->PrintExpr(op->args[2]); + this->PrintIndent(); + if (op->args.size() == 3) { + // Non-predicated version + this->stream << "tl::cp_async_gs<" << size << ">(" << dst << ", " << src + << ");\n"; } else { - std::string condition = this->PrintExpr(op->args[5]); - this->PrintIndent(); + // Predicated version + std::string condition = this->PrintExpr(op->args[3]); this->stream << "tl::cp_async_gs_conditional<" << size << ">(" << dst - << "+" << dst_offset << ", " << src << "+" << src_offset - << ", " << condition << ");\n"; + << ", " << src << ", " << condition << ");\n"; } } else if (op->op.same_as(builtin::ptx_commit_group())) { print_extern_call_stmt("tl::cp_async_commit"); From 9ea2b555bb89c1f8b279bfa8d0d7aeb65d12c883 Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Mon, 5 Jan 2026 11:17:05 +0800 Subject: [PATCH 5/7] pre-commit fix --- src/target/codegen_hip.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/target/codegen_hip.cc b/src/target/codegen_hip.cc index 05bbd0acf..ce904307a 100644 --- a/src/target/codegen_hip.cc +++ b/src/target/codegen_hip.cc @@ -767,7 +767,8 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) { } this->stream << ");\n"; }; - if (op->op.same_as(builtin::ptx_cp_async()) || op->op.same_as(tl::ptx_cp_async())) { + if (op->op.same_as(builtin::ptx_cp_async()) || + op->op.same_as(tl::ptx_cp_async())) { // args[0] = dst_access_ptr, args[1] = src_access_ptr, args[2] = bytes, // args[3] = predicate (optional) ICHECK(op->args.size() == 3 || op->args.size() == 4) From 413522f4717e748cca09f90d0a761efb011bf65a Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Mon, 5 Jan 2026 11:33:39 +0800 Subject: [PATCH 6/7] fix test with pdl --- testing/python/jit/test_tilelang_jit_cython.py | 14 ++------------ testing/python/jit/test_tilelang_jit_nvrtc.py | 14 ++------------ testing/python/jit/test_tilelang_jit_tvm_ffi.py | 14 ++------------ 3 files changed, 6 insertions(+), 36 deletions(-) diff --git a/testing/python/jit/test_tilelang_jit_cython.py b/testing/python/jit/test_tilelang_jit_cython.py index a492b936b..07b369bea 100644 --- a/testing/python/jit/test_tilelang_jit_cython.py +++ b/testing/python/jit/test_tilelang_jit_cython.py @@ -3,23 +3,13 @@ import tilelang.testing import tilelang import torch -import pytest - - -def check_pdl(): - if not torch.cuda.is_available(): - return False - props = torch.cuda.get_device_properties(0) - compute_capability = props.major, props.minor - return compute_capability[0] >= 9 +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) def test_cython_pdl(): """Test pdl.""" - if not check_pdl(): - pytest.skip("PDL Test requires compute capability >= 9") - N = 64 @tilelang.jit(execution_backend="cython") diff --git a/testing/python/jit/test_tilelang_jit_nvrtc.py b/testing/python/jit/test_tilelang_jit_nvrtc.py index 581fbc7e9..1ccd2c5c3 100644 --- a/testing/python/jit/test_tilelang_jit_nvrtc.py +++ b/testing/python/jit/test_tilelang_jit_nvrtc.py @@ -4,7 +4,6 @@ import tilelang import torch from tilelang.utils.tensor import 
map_torch_type -import pytest def matmul( @@ -439,20 +438,11 @@ def kernel( print("L2 persistent map test passed!") -def check_pdl(): - if not torch.cuda.is_available(): - return False - props = torch.cuda.get_device_properties(0) - compute_capability = props.major, props.minor - return compute_capability[0] >= 9 - - +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) def test_nvrtc_pdl(): """Test pdl.""" - if not check_pdl(): - pytest.skip("PDL Test requires compute capability >= 9") - N = 64 @tilelang.jit(execution_backend="nvrtc") diff --git a/testing/python/jit/test_tilelang_jit_tvm_ffi.py b/testing/python/jit/test_tilelang_jit_tvm_ffi.py index a53c6f70a..4b8e99764 100644 --- a/testing/python/jit/test_tilelang_jit_tvm_ffi.py +++ b/testing/python/jit/test_tilelang_jit_tvm_ffi.py @@ -4,7 +4,6 @@ import tilelang import torch from tilelang.utils.tensor import map_torch_type -import pytest def matmul( @@ -444,20 +443,11 @@ def kernel( print("L2 persistent map test passed!") -def check_pdl(): - if not torch.cuda.is_available(): - return False - props = torch.cuda.get_device_properties(0) - compute_capability = props.major, props.minor - return compute_capability[0] >= 9 - - +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) def test_tvm_ffi_pdl(): """Test pdl.""" - if not check_pdl(): - pytest.skip("PDL Test requires compute capability >= 9") - N = 64 @tilelang.jit(execution_backend="tvm_ffi") From 2e3e2fd2a83025dab3a69b2de597da083b6c5ac6 Mon Sep 17 00:00:00 2001 From: Gongen-Ali Date: Mon, 5 Jan 2026 12:49:37 +0800 Subject: [PATCH 7/7] fix pdl test --- testing/python/language/test_tilelang_language_pdl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/testing/python/language/test_tilelang_language_pdl.py b/testing/python/language/test_tilelang_language_pdl.py index 4d0120c6e..77fe984ea 100644 --- a/testing/python/language/test_tilelang_language_pdl.py +++ b/testing/python/language/test_tilelang_language_pdl.py @@ -34,6 +34,7 @@ def main( return main +@tilelang.testing.requires_cuda def test_pdl_trigger(): N = 64 program = kernels_with_pdl_trigger(N) @@ -43,6 +44,7 @@ def test_pdl_trigger(): assert "cudaTriggerProgrammaticLaunchCompletion" in code +@tilelang.testing.requires_cuda def test_pdl_sync(): N = 64 program = kernels_with_pdl_sync(N)
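
Illustrative sketch (not part of the patch series): the reference result in test_tilelang_kernel_gemm_simt.py is now cast to out_dtype instead of accum_dtype, presumably because torch.testing.assert_close checks dtypes by default and the kernel output C is stored in out_dtype. A minimal stand-alone version of that comparison, with the kernel output mocked on the host:

import torch

out_dtype = "float16"
A = torch.randn(8, 16, dtype=torch.float16)
B = torch.randn(8, 16, dtype=torch.float16)

# Stand-in for the kernel output: accumulate in fp32, store the result in out_dtype.
C = (A.to(torch.float32) @ B.T.to(torch.float32)).to(getattr(torch, out_dtype))

# Reference computed the same way; casting it to accum_dtype (float32) instead would
# make assert_close fail on the dtype check before any values are compared.
ref_c = torch.matmul(A.to(torch.float32), B.T.to(torch.float32)).to(getattr(torch, out_dtype))
torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2)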
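Illustrative sketch: the lower.py refactor drops the hand-rolled os.environ and relative-path probing in favour of constants exported by tilelang.env. Assuming a local tilelang install where those constants resolve, the option lists built by the two compile callbacks reduce to:

from tilelang.env import COMPOSABLE_KERNEL_INCLUDE_DIR, CUTLASS_INCLUDE_DIR, TILELANG_TEMPLATE_PATH

# nvcc callback: tilelang templates plus the CUTLASS headers.
nvcc_options = ["-std=c++17", "-I" + TILELANG_TEMPLATE_PATH, "-I" + CUTLASS_INCLUDE_DIR]
# hipcc callback: tilelang templates plus the Composable Kernel headers.
hipcc_options = ["-std=c++17", "-I" + TILELANG_TEMPLATE_PATH, "-I" + COMPOSABLE_KERNEL_INCLUDE_DIR]
print(nvcc_options)
print(hipcc_options)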
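Illustrative sketch of the declaration parsing that the new get_declaration hook in wrapper.py encodes: the CUDA wrapper keeps splitting on ';' (its generated sources expose a declaration terminated by a semicolon), while the HIP override splits on the opening brace because HIP sources go straight into the kernel body. The CUDA string below is a hypothetical example; the HIP string is the one quoted in the diff comment.

# CUDA-style source: a forward declaration terminated by ';' precedes the definition.
cuda_source = 'extern "C" __global__ void main_kernel(float* __restrict__ A);\n// definition follows'
# HIP-style source: no forward declaration, the signature runs straight into '{'.
hip_source = "__global__ void __launch_bounds__(128) kernel_kernel(float* __restrict__ A) {\n  // body"

# Base class (CUDA): everything up to the terminating semicolon is the declaration.
print(cuda_source.split(";")[0])
# HIP override: no semicolon-terminated declaration exists, so stop at the brace.
print(hip_source.split("{")[0])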
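Illustrative sketch of the codegen_hip.cc change: ptx_cp_async lowering now takes access pointers directly, (dst, src, bytes[, predicate]), instead of the old (dst, dst_offset, src, src_offset, bytes[, predicate]) form. A rough Python mock of the two HIP calls the branch emits, with made-up operand strings:

def emit_cp_async_gs(dst: str, src: str, size: str, pred: str | None = None) -> str:
    # Mirrors the emitted source: predicated calls go through cp_async_gs_conditional.
    if pred is None:
        return f"tl::cp_async_gs<{size}>({dst}, {src});"
    return f"tl::cp_async_gs_conditional<{size}>({dst}, {src}, {pred});"


print(emit_cp_async_gs("dst_access_ptr", "src_access_ptr", "16"))
print(emit_cp_async_gs("dst_access_ptr", "src_access_ptr", "16", "cond"))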
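Illustrative sketch of the test-gating pattern the last three patches converge on: the ad-hoc check_pdl() helper and pytest.skip calls are replaced by the declarative decorators already used throughout the series. The body below is a placeholder; an actual PDL kernel launch would go where the ellipsis is.

import tilelang
import tilelang.testing


@tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version(9, 0)  # PDL requires compute capability >= 9
def test_pdl_gated_placeholder():
    ...  # launch a kernel that uses programmatic dependent launch here


if __name__ == "__main__":
    tilelang.testing.main()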