tile-ai · LeiWang1999 · Aug 17, 2025 · Aug 12, 2025 · Aug 12, 2025 · Aug 14, 2025
diff --git a/benchmark/matmul/benchmark_matmul.py b/benchmark/matmul/benchmark_matmul.py
@@ -53,7 +53,7 @@ def get_configs(args, kwargs):
         from tilelang.carver.roller.rasterization import NoRasterization
         import torch
 
-        arch = CDNA("cuda") if torch.version.hip is None else CUDA("hip")
+        arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
         topk = 10
 
         carve_template = MatmulTemplate(

diff --git a/benchmark/matmul/benchmark_matmul_intrinsic.py b/benchmark/matmul/benchmark_matmul_intrinsic.py
@@ -187,7 +187,7 @@ def get_configs(args, kwargs):
         from tilelang.carver.roller.rasterization import NoRasterization
         import torch
 
-        arch = CDNA("cuda") if torch.version.hip is None else CUDA("hip")
+        arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
         topk = 10
 
         carve_template = MatmulTemplate(

diff --git a/examples/analyze/example_conv_analyze.py b/examples/analyze/example_conv_analyze.py
@@ -96,7 +96,7 @@ def conv(
 
 def main():
     my_func = kernel(N, C, H, W, F, K, S, D, P, 64, 128, 32, 3, 256)
-    cuda_device = CDNA("cuda") if torch.version.hip is None else CUDA("hip")
+    cuda_device = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
     result = Analyzer.analysis(my_func, cuda_device)
     print(result)
     print(f"Analyzed FLOPs: {result.total_flops}")

diff --git a/examples/analyze/example_gemm_analyze.py b/examples/analyze/example_gemm_analyze.py
@@ -49,7 +49,7 @@ def matmul(
 def main():
     my_func = kernel(128, 128, 32, 3, 128, True)
 
-    cuda_device = CDNA("cuda") if torch.version.hip is None else CUDA("hip")
+    cuda_device = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
     result = Analyzer.analysis(my_func, cuda_device)
 
     print(f"Analyzed FLOPs: {result.total_flops}")

diff --git a/examples/fusedmoe/example_fusedmoe_tilelang.py b/examples/fusedmoe/example_fusedmoe_tilelang.py
@@ -7,8 +7,6 @@
 from tilelang.autotuner import *
 from example_fusedmoe_torch import *
 
-# tilelang.disable_cache()
-
 
 @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True})
 def moe_forward_tilelang_shared(d_hidden,

diff --git a/examples/gemm/example_gemm_autotune.py b/examples/gemm/example_gemm_autotune.py
@@ -16,7 +16,7 @@ def ref_program(A, B):
 
 def get_configs(M, N, K, with_roller=False, topk=20):
     if with_roller:
-        arch = CDNA("cuda") if torch.version.hip is None else CUDA("hip")
+        arch = CUDA("cuda") if torch.version.hip is None else CDNA("hip")
         carve_template = MatmulTemplate(
             M=M,
             N=N,

diff --git a/examples/warp_specialize/example_warp_specialize_flashmla.py b/examples/warp_specialize/example_warp_specialize_flashmla.py
@@ -145,20 +145,10 @@ def flash_attn(
                         clear_accum=True,
                         wg_wait=-1)
                     T.barrier_wait(kv_shared_0_r_is_ready, k % 2)
-                    T.gemm(
-                        Q_shared_r,
-                        KV_shared_0_r,
-                        acc_s_0,
-                        transpose_B=True,
-                        wg_wait=-1)
+                    T.gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True, wg_wait=-1)
 
                     T.barrier_wait(kv_shared_0_pe_is_ready, k % 2)
-                    T.gemm(
-                        Q_pe_local_0,
-                        K_pe_shared_0,
-                        acc_s_0,
-                        transpose_B=True,
-                        wg_wait=-1)
+                    T.gemm(Q_pe_local_0, K_pe_shared_0, acc_s_0, transpose_B=True, wg_wait=-1)
 
                     T.wait_wgmma(0)
 
@@ -261,20 +251,10 @@ def flash_attn(
                         wg_wait=-1)
 
                     T.barrier_wait(kv_shared_1_r_is_ready, k % 2)
-                    T.gemm(
-                        Q_shared_r,
-                        KV_shared_1_r,
-                        acc_s_1,
-                        transpose_B=True,
-                        wg_wait=-1)
+                    T.gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True, wg_wait=-1)
 
                     T.barrier_wait(kv_shared_1_pe_is_ready, k % 2)
-                    T.gemm(
-                        Q_pe_local_1,
-                        K_pe_shared_1,
-                        acc_s_1,
-                        transpose_B=True,
-                        wg_wait=-1)
+                    T.gemm(Q_pe_local_1, K_pe_shared_1, acc_s_1, transpose_B=True, wg_wait=-1)
 
                     T.wait_wgmma(0)
 
@@ -308,11 +288,7 @@ def flash_attn(
 
                     # Step 10. compute O1 with KV_shared_1_rd
                     T.copy(acc_s_1, acc_s_1_cast)
-                    T.gemm(
-                        acc_s_1_cast,
-                        KV_shared_1_r,
-                        acc_o_r,
-                        wg_wait=-1)
+                    T.gemm(acc_s_1_cast, KV_shared_1_r, acc_o_r, wg_wait=-1)
                     T.copy(acc_s_1_cast, SP1_shared)
                     T.barrier_arrive(s_shared_ready_barrier)
 

diff --git a/setup.py b/setup.py
@@ -1,3 +1,6 @@
+import fcntl
+import functools
+import hashlib
 import io
 import subprocess
 import shutil
@@ -12,17 +15,14 @@
 import os
 import sys
 import site
-import hashlib
 import sysconfig
-import functools
 import urllib.request
 from packaging.version import Version
 import platform
 import multiprocessing
 from setuptools.command.build_ext import build_ext
 import importlib
 import logging
-import fcntl
 
 # Configure logging with basic settings
 logging.basicConfig(
@@ -692,15 +692,15 @@ def build_cython(self, ext):
                 with open(md5_path, "r") as f:
                     cached_hash = f.read().strip()
                     if cached_hash == code_hash:
-                        logger.info("Cython jit adapter is up to date, no need to compile...")
+                        logger.info("Cython JIT adapter is up to date, no need to compile...")
                         need_compile = False
                     else:
-                        logger.info("Cython jit adapter is out of date, need to recompile...")
+                        logger.info("Cython JIT adapter is out of date, need to recompile...")
             else:
-                logger.info("No cached version found for cython jit adapter, need to compile...")
+                logger.info("No cached version found for Cython JIT adapter, need to compile...")
 
             if need_compile:
-                logger.info("Waiting for lock to compile cython jit adapter...")
+                logger.info("Waiting for lock to compile Cython JIT adapter...")
                 with open(lock_file, 'w') as lock:
                     fcntl.flock(lock.fileno(), fcntl.LOCK_EX)
                     try:
@@ -715,7 +715,7 @@ def build_cython(self, ext):
                                     need_compile = False
 
                         if need_compile:
-                            logger.info("Compiling cython jit adapter...")
+                            logger.info("Compiling Cython JIT adapter...")
                             temp_path = cache_dir / f"temp_{code_hash}.so"
 
                             with open(md5_path, "w") as f:
@@ -736,7 +736,7 @@ def build_cython(self, ext):
                     except Exception as e:
                         if 'temp_path' in locals() and temp_path.exists():
                             temp_path.unlink()
-                        raise Exception(f"Failed to compile cython jit adapter: {e}") from e
+                        raise Exception(f"Failed to compile Cython JIT adapter: {e}") from e
                     finally:
                         if lock_file.exists():
                             lock_file.unlink()

diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc
@@ -1689,6 +1689,76 @@ void CodeGenTileLangCUDA::VisitExpr_(const RampNode *op, std::ostream &os) {
   os << "))";
 }
 
+void CodeGenTileLangCUDA::VisitExpr_(const BufferLoadNode *op,
+                                     std::ostream &os) { // NOLINT(*)
+  ICHECK_EQ(op->indices.size(), 1)
+      << "Load from non-flat memory not supported.";
+  ICHECK(!op->predicate.defined())
+      << "Predicated buffer load is not supported.";
+
+  DataType value_dtype = op->dtype;
+  PrimExpr index = op->indices[0];
+  Var buffer_var = op->buffer->data;
+  DataType element_dtype = op->buffer->dtype;
+
+  int lanes = op->dtype.lanes();
+  // declare type.
+  if (value_dtype.lanes() == element_dtype.lanes()) {
+    std::string ref = GetBufferRef(op->dtype, op->buffer.get(), index);
+    HandleVolatileLoads(ref, op, os);
+  } else {
+    bool can_vector_load = false;
+    arith::PVar<PrimExpr> base;
+    if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) {
+      const RampNode *ramp = index.as<RampNode>();
+      ICHECK(ramp);
+      can_vector_load = true;
+      // arith::ModularSet me = arith::Analyzer().modular_set(ramp->base);
+      // The condition: {k * coeff + base} divisible by the alignment for any k
+      // if (me->coeff % op->dtype.lanes() == 0 && me->base % op->dtype.lanes()
+      // == 0) {
+      //   can_vector_load = true;
+      // }
+    }
+
+    if (value_dtype.is_float4_e2m1fn() && lanes != 1) {
+      // A float4_e2m1fn element has 4 bits, which is an incomplete byte.
+      // So we cannot vector load it.
+      can_vector_load = false;
+    }
+    if (can_vector_load) {
+      std::string ref = GetVecLoad(op->dtype, op->buffer.get(), base.Eval());
+      HandleVolatileLoads(ref, op, os);
+    } else {
+      std::ostringstream svalue_expr;
+      std::string sindex = SSAGetID(PrintExpr(index), index.dtype());
+      std::string vid = GetVarID(buffer_var.get());
+      DataType elem_type = op->dtype.element_of();
+      for (int i = 0; i < lanes; ++i) {
+        std::ostringstream value_temp;
+        if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
+          value_temp << "((";
+          if (buffer_var.get()->dtype.is_handle()) {
+            auto it = alloc_storage_scope_.find(buffer_var.get());
+            if (it != alloc_storage_scope_.end()) {
+              PrintStorageScope(it->second, value_temp);
+            }
+          }
+          PrintType(elem_type, value_temp);
+          value_temp << "*)" << vid << ')';
-          value_temp << "((";
-          if (buffer_var.get()->dtype.is_handle()) {
-            auto it = alloc_storage_scope_.find(buffer_var.get());
-            if (it != alloc_storage_scope_.end()) {
-              PrintStorageScope(it->second, value_temp);
-            }
-          }
-          PrintType(elem_type, value_temp);
-          value_temp << "*)" << vid << ')';
+        if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
+          value_temp << "((";
+          if (buffer_var.get()->dtype.is_handle()) {
+            auto it = alloc_storage_scope_.find(buffer_var.get());
+            if (it != alloc_storage_scope_.end() && IsScopePartOfType()) {
+              PrintStorageScope(it->second, value_temp);
+            }
+          }
+          PrintType(elem_type, value_temp);
+          value_temp << "*)" << vid << ')';
+        } else {
+          value_temp << vid;
+        }
-          value_temp << "((";
-          if (buffer_var.get()->dtype.is_handle()) {
-            auto it = alloc_storage_scope_.find(buffer_var.get());
-            if (it != alloc_storage_scope_.end()) {
-              PrintStorageScope(it->second, value_temp);
-            }
-          }
-          PrintType(elem_type, value_temp);
-          value_temp << "*)" << vid << ')';
+        if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
+          value_temp << "((";
+          if (buffer_var.get()->dtype.is_handle()) {
+            auto it = alloc_storage_scope_.find(buffer_var.get());
+            if (it != alloc_storage_scope_.end() && IsScopePartOfType()) {
+              PrintStorageScope(it->second, value_temp);
+            }
+          }
+          PrintType(elem_type, value_temp);
+          value_temp << "*)" << vid << ')';
+        } else {
+          value_temp << vid;
+        }
+        } else {
+          value_temp << vid;
+        }
+        value_temp << '[';
+        PrintVecElemLoad(sindex, index.dtype(), i, value_temp);
+        value_temp << ']';
+        PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr);
+      }
+      os << svalue_expr.str();
+    }
+  }
+}
+
 void CodeGenTileLangCUDA::VisitExpr_(const BroadcastNode *op,
                                      std::ostream &os) { // NOLINT(*)
   int lanes = static_cast<int>(Downcast<IntImm>(op->lanes)->value);

diff --git a/src/target/codegen_cuda.h b/src/target/codegen_cuda.h
@@ -50,6 +50,7 @@ class CodeGenTileLangCUDA final : public CodeGenC {
   void VisitStmt_(const EvaluateNode *op) final;
   void VisitStmt_(const AllocateNode *op) final;
   void VisitStmt_(const AttrStmtNode *op) final;
+  void VisitExpr_(const BufferLoadNode *op, std::ostream &os) final;
 
   // Override this as a work around for __grid_constant__ parameter
   void AddFunction(const GlobalVar &gvar, const PrimFunc &f);

diff --git a/src/tl_templates/hip/reduce.h b/src/tl_templates/hip/reduce.h
@@ -22,7 +22,8 @@ struct MinOp {
   }
 };
 
-template <class Reducer, int threads, int scale, int thread_offset = 0> struct AllReduce {
+template <class Reducer, int threads, int scale, int thread_offset = 0>
+struct AllReduce {
   static_assert(threads == 1024 || threads == 512 || threads == 256 ||
                 threads == 128 || threads == 64 || threads == 32 ||
                 threads == 16 || threads == 8 || threads == 4 || threads == 2);

diff --git a/src/transform/loop_vectorize.cc b/src/transform/loop_vectorize.cc
@@ -136,11 +136,23 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer {
         max_vector_size = gcd_base;
       }
       vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_);
+
+      // Generate strides if not existed
+      auto strides = buffer->strides;
+      if (buffer->strides.size() == 0) {
+        PrimExpr stride = 1;
+        for (int i = indices.size() - 1; i >= 0; --i) {
+          strides.push_back(stride);
+          stride = stride * buffer->shape[i];
+        }
+        strides = Array<PrimExpr>{strides.rbegin(), strides.rend()};
+      }
+
+      // Generate and check element offset expression
+      ICHECK(indices.size() == strides.size()) << "Invalid indices and strides";
       PrimExpr elem_offset = 0;
-      PrimExpr stride = 1;
-      for (int i = indices.size() - 1; i >= 0; --i) {
-        elem_offset = elem_offset + indices[i] * stride;
-        stride = stride * buffer->shape[i];
+      for (int i = 0; i < indices.size(); ++i) {
+        elem_offset += indices[i] * strides[i];
       }
-      // Generate strides if not existed
-      auto strides = buffer->strides;
-      if (buffer->strides.size() == 0) {
-        PrimExpr stride = 1;
-        for (int i = indices.size() - 1; i >= 0; --i) {
-          strides.push_back(stride);
-          stride = stride * buffer->shape[i];
-        }
-        strides = Array<PrimExpr>{strides.rbegin(), strides.rend()};
-      }
-
-      // Generate and check element offset expression
-      ICHECK(indices.size() == strides.size()) << "Invalid indices and strides";
-      PrimExpr elem_offset = 0;
-      PrimExpr stride = 1;
-      for (int i = indices.size() - 1; i >= 0; --i) {
-        elem_offset = elem_offset + indices[i] * stride;
-        stride = stride * buffer->shape[i];
-      for (int i = 0; i < indices.size(); ++i) {
-        elem_offset += indices[i] * strides[i];
-      }
+      // Compute element offset (supports both explicit and implicit strides)
+      PrimExpr elem_offset = 0;
+      if (buffer->strides.size() == 0) {
+        PrimExpr stride = 1;
+        for (int i = indices.size() - 1; i >= 0; --i) {
+          elem_offset += indices[i] * stride;
+          stride = stride * buffer->shape[i];
+        }
+      } else {
+        ICHECK(indices.size() == buffer->strides.size()) << "Invalid indices and strides";
+        for (int i = 0; i < indices.size(); ++i) {
+          elem_offset += indices[i] * buffer->strides[i];
+        }
+      }
-      // Generate strides if not existed
-      auto strides = buffer->strides;
-      if (buffer->strides.size() == 0) {
-        PrimExpr stride = 1;
-        for (int i = indices.size() - 1; i >= 0; --i) {
-          strides.push_back(stride);
-          stride = stride * buffer->shape[i];
-        }
-        strides = Array<PrimExpr>{strides.rbegin(), strides.rend()};
-      }
-
-      // Generate and check element offset expression
-      ICHECK(indices.size() == strides.size()) << "Invalid indices and strides";
-      PrimExpr elem_offset = 0;
-      PrimExpr stride = 1;
-      for (int i = indices.size() - 1; i >= 0; --i) {
-        elem_offset = elem_offset + indices[i] * stride;
-        stride = stride * buffer->shape[i];
-      for (int i = 0; i < indices.size(); ++i) {
-        elem_offset += indices[i] * strides[i];
-      }
+      // Compute element offset (supports both explicit and implicit strides)
+      PrimExpr elem_offset = 0;
+      if (buffer->strides.size() == 0) {
+        PrimExpr stride = 1;
+        for (int i = indices.size() - 1; i >= 0; --i) {
+          elem_offset += indices[i] * stride;
+          stride = stride * buffer->shape[i];
+        }
+      } else {
+        ICHECK(indices.size() == buffer->strides.size()) << "Invalid indices and strides";
+        for (int i = 0; i < indices.size(); ++i) {
+          elem_offset += indices[i] * buffer->strides[i];
+        }
+      }
       while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var,
                                  inner_for_->extent, vector_size_,
@@ -229,10 +241,19 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size,
   ICHECK(target_vectorized_size >= 1);
   if (target_vectorized_size == 1)
     return true;
-  // bind thread range
+
+  // Extent must be divisible by the target vectorized size
   if (!analyzer->CanProveEqual(FloorMod(iter_var_size, target_vectorized_size),
                                0))
     return false;
+
+  // The base offset (expr at var == 0) must be divisible by the vector size
+  if (!analyzer->CanProveEqual(
+          FloorMod(Substitute(expr, {{var, 0}}), target_vectorized_size), 0)) {
+    return false;
+  }
+
+  // Bind thread range
   Var v0("v0"), v1("v1");
   analyzer->Bind(v0, Range(0, target_vectorized_size));
   analyzer->Bind(v1, Range(0, analyzer->Simplify(FloorDiv(
@@ -241,7 +262,8 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size,
       Substitute(expr, {{var, v0 + v1 * target_vectorized_size}}));
   Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size));
   PrimExpr expr_vectorized = vectorizer.VisitExpr(expr_transformed);
-  // This simplify is necessary for thread region specifiled
+
+  // This simplify is necessary for thread region specified
   // optimizations.
   expr_vectorized = analyzer->Simplify(expr_vectorized);
   auto ramp_node = expr_vectorized.as<RampNode>();

diff --git a/testing/python/language/test_tilelang_language_copy.py b/testing/python/language/test_tilelang_language_copy.py
@@ -28,8 +28,8 @@ def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype="float16")
         out_idx=[1],
         target="cuda",
         pass_configs={
-            "tl.disable_warp_specialized": True,
-            "tl.disable_tma_lower": True
+            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True
         })
     a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype))
     b = kernel(a)
@@ -42,5 +42,49 @@ def test_tilelang_copy():
     run_tilelang_copy(M=1024, N=576, block_M=32, block_N=576, dtype="float")
 
 
+def tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype="float16"):
+
+    @T.prim_func
+    def main(
+            A: T.StridedTensor((M, N), (NN, 1), dtype),
+            B: T.Tensor((M, N), dtype),
+    ):
+        # Initialize Kernel Context
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
+            for i, j in T.Parallel(block_M, block_N):
+                B[by * block_M + i, bx * block_N + j] = A[by * block_M + i, bx * block_N + j]
+
+    return main
+
+
+def run_tilelang_copy_with_stride(M=1024,
+                                  N=1024,
+                                  NN=2048,
+                                  block_M=128,
+                                  block_N=128,
+                                  dtype="float16"):
+    if isinstance(NN, int):
+        assert NN > N, "NN must be greater than N"
+    program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype)
+    kernel = tilelang.compile(
+        program,
+        out_idx=[1],
+        target="cuda",
+        pass_configs={
+            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        })
+    if isinstance(NN, T.Var):
+        NN = N * 2
+    a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype))
+    b = kernel(a[:, :N])
+    torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2)
+
-    if isinstance(NN, int):
-        assert NN > N, "NN must be greater than N"
-    program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype)
-    kernel = tilelang.compile(
-        program,
-        out_idx=[1],
-        target="cuda",
-        pass_configs={
-            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-        })
-    if isinstance(NN, T.Var):
-        NN = N * 2
-    a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype))
-    b = kernel(a[:, :N])
-    torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2)
+    if isinstance(NN, int):
+        assert NN >= N, "NN must be greater than or equal to N"
+    program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype)
+    kernel = tilelang.compile(
+        program,
+        out_idx=[1],
+        target="cuda",
+        pass_configs={
+            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        })
+    # For any non-int (e.g., tvm.tir.Var), pick a concrete size
+    if not isinstance(NN, int):
+        NN = N * 2
+    a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype))
+    b = kernel(a[:, :N])
+    torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2)
-    if isinstance(NN, int):
-        assert NN > N, "NN must be greater than N"
-    program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype)
-    kernel = tilelang.compile(
-        program,
-        out_idx=[1],
-        target="cuda",
-        pass_configs={
-            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-        })
-    if isinstance(NN, T.Var):
-        NN = N * 2
-    a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype))
-    b = kernel(a[:, :N])
-    torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2)
+    if isinstance(NN, int):
+        assert NN >= N, "NN must be greater than or equal to N"
+    program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype)
+    kernel = tilelang.compile(
+        program,
+        out_idx=[1],
+        target="cuda",
+        pass_configs={
+            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        })
+    # For any non-int (e.g., tvm.tir.Var), pick a concrete size
+    if not isinstance(NN, int):
+        NN = N * 2
+    a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype))
+    b = kernel(a[:, :N])
+    torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2)
+
+def test_tilelang_copy_with_stride():
+    run_tilelang_copy_with_stride(M=1024, N=1024, NN=2048, block_M=128, block_N=128)
+    run_tilelang_copy_with_stride(M=1024, N=1024, NN=T.symbolic("NN"), block_M=128, block_N=128)
+
+
 if __name__ == "__main__":
     tilelang.testing.main()