
Commit cb37bfe

[Refactor] Refactor barrier management (#744)
* Introduce Barrier
* Enhance CUDA kernel with new barrier management and post-processing support
  - Added a new CUDA kernel implementation in `example_mla_decode.py` for improved performance with shared memory barriers.
  - Refactored barrier handling in `codegen_cuda.cc` and `codegen_hip.cc` to utilize a more flexible mbarrier structure.
  - Updated intrinsic definitions from `ptx_stmatirx` to `ptx_stmatrix` across multiple files for consistency.
  - Introduced additional print statements for debugging in the lowering phase of the TileLang engine.
  - Enhanced the overall structure and readability of the codebase.
* Remove unused barrier handling code in the CUDA and HIP code generators to streamline the implementation. This change enhances code clarity and reduces complexity in the barrier management logic.
* Enhance barrier management in TileLang
  - Introduced a new intrinsic `allocate_barrier` for dynamic barrier allocation in the TileLang framework.
  - Updated CUDA code generation to support the new barrier structure, allowing for improved synchronization in shared memory.
  - Refactored existing barrier handling logic to accommodate the new intrinsic and streamline code.
  - Added print statements for debugging purposes in various examples and the lowering phase of the TileLang engine.
  - Removed deprecated memory scope handling code to enhance clarity and maintainability.
* lint fix
* lint fix
* Remove the `allocate_barrier` intrinsic and related code from TileLang to streamline barrier management. This includes updates to CUDA code generation and the removal of associated Python wrappers, enhancing code clarity and maintainability.
* Refactor logging in JITKernel to improve kernel compilation tracking
  - Removed an unused import of `torch.backends` in the example file.
  - Introduced logging for kernel compilation in `JITKernel`, replacing print statements with structured logging for better traceability and debugging.
  - Added an assertion to ensure the presence of the `global_symbol` attribute in the kernel function.
* Refactor dequantization tests and update barrier function
  - Removed the test for `example_dequant_gemm_bf16_fp4_hopper_serial` to streamline the testing suite.
  - Updated the `mbarrier_cp_async_arrive` function to support both pointer and non-pointer types, enhancing flexibility in barrier management (see the sketch below the change statistics).
* Update CI configuration to increase pytest parallelism from 4 to 8 threads for improved test execution speed.
* Fix typos in rasterization parameters and update the import path for the cached module
  - Corrected the spelling of `enable_rasteration` to `enable_rasterization` in the matmul function and its usage.
  - Updated the import statement for the `cached` module to reflect its new path in the cache submodule.
  - Added the `StridedTensor` import in the language module for enhanced tensor functionality.
* Update ci.yml
1 parent eccdfe1 commit cb37bfe

24 files changed: 421 additions, 365 deletions
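Note: the commit message above states that `mbarrier_cp_async_arrive` now supports both pointer and non-pointer types. Below is a minimal C++ sketch of how one entry point can serve both argument forms; the helper `ptx_cp_async_mbarrier_arrive` and the exact signatures are illustrative assumptions, not code from this repository.

```cpp
// Hedged sketch only: one way to accept either a pointer to an mbarrier object
// or a raw shared-memory address, as the commit message describes for
// mbarrier_cp_async_arrive. The helper below is a hypothetical stand-in for the
// underlying cp.async.mbarrier.arrive instruction.
#include <cstdint>
#include <type_traits>

inline void ptx_cp_async_mbarrier_arrive(uint64_t smem_addr) {
  (void)smem_addr;  // placeholder: real code would emit the PTX instruction here
}

template <typename BarrierT>
inline void mbarrier_cp_async_arrive(BarrierT barrier) {
  if constexpr (std::is_pointer_v<BarrierT>) {
    // Pointer form: the caller passed the address of a barrier object.
    ptx_cp_async_mbarrier_arrive(reinterpret_cast<uintptr_t>(barrier));
  } else {
    // Address form: the caller already supplies an integer shared-memory address.
    ptx_cp_async_mbarrier_arrive(static_cast<uint64_t>(barrier));
  }
}
```

In device code this would typically be a `__device__` function backed by inline PTX; the host-compilable version here only demonstrates how overload resolution handles both argument forms.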

examples/dequantize_gemm/test_example_dequantize_gemm.py

Lines changed: 0 additions & 7 deletions
@@ -2,7 +2,6 @@
 
 import example_dequant_gemv_fp16xint4
 import example_dequant_gemm_fp4_hopper
-import example_dequant_gemm_bf16_fp4_hopper_serial
 
 
 @tilelang.testing.requires_cuda
@@ -16,11 +15,5 @@ def test_example_dequant_gemm_fp4_hopper():
     example_dequant_gemm_fp4_hopper.main()
 
 
-@tilelang.testing.requires_cuda
-@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
-def test_example_dequant_gemm_bf16_fp4_hopper_serial():
-    example_dequant_gemm_bf16_fp4_hopper_serial.main()
-
-
 if __name__ == "__main__":
     tilelang.testing.main()

examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 import torch
-import torch.backends
 from tilelang import tvm as tvm
 import tilelang.testing
 from tvm import DataType

examples/warp_specialize/example_warp_specialize_flashmla.py

Lines changed: 1 addition & 0 deletions
@@ -391,6 +391,7 @@ def main(batch=1, heads=128, kv_heads=1, kv_ctx=8192, dim=512, pe_dim=64):
     num_split = 1
 
     kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split)
+    print(kernel.get_kernel_source())
     profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn)
     profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)
     latency = profiler.do_bench(warmup=500)

examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py

Lines changed: 0 additions & 1 deletion
@@ -66,7 +66,6 @@ def main():
 
     # Run the kernel through the Profiler
     c = jit_kernel(a, b)
-
     # Reference multiplication using PyTorch
     ref_c = a @ b
 

src/op/builtin.cc

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ TIR_DEFINE_TL_BUILTIN(ptx_ldmatirx)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kOpaque));
 
-TIR_DEFINE_TL_BUILTIN(ptx_stmatirx)
+TIR_DEFINE_TL_BUILTIN(ptx_stmatrix)
     .set_num_inputs(-1)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kOpaque));
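Note: this rename has to match the declaration change in `src/op/builtin.h` shown in the next diff. A condensed C++ sketch of the declare/register pair follows, with comments on what the registration arguments mean; it assumes the TVM headers and the `TIR_DEFINE_TL_BUILTIN` macro already used by these files and is not verbatim source.

```cpp
// Condensed sketch of the declare/register pattern this rename touches.

// builtin.h: TVM_DLL exports the accessor so code outside the shared library
// can obtain a reference to the op.
TVM_DLL const Op &ptx_stmatrix();

// builtin.cc: register the op. set_num_inputs(-1) marks a variable-length
// argument list, and kOpaque tells TVM the call has opaque side effects, so
// it cannot be reordered or removed by generic optimizations.
TIR_DEFINE_TL_BUILTIN(ptx_stmatrix)
    .set_num_inputs(-1)
    .set_attr<TCallEffectKind>("TCallEffectKind",
                               Integer(CallEffectKind::kOpaque));
```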

src/op/builtin.h

Lines changed: 21 additions & 21 deletions
@@ -62,7 +62,7 @@ static constexpr const char *kDynamicAlignment = "tl.dynamic_alignment";
  * swizzle, l2_promotion, oob_fill)
  *
  */
-const Op &create_tma_descriptor();
+TVM_DLL const Op &create_tma_descriptor();
 
 /*!
  * \brief tvm intrinsics for TMADescriptor creation for image to column load
@@ -73,23 +73,23 @@ const Op &create_tma_descriptor();
  * l2_promotion, oob_fill)
  *
  */
-const Op &create_tma_im2col_descriptor();
+TVM_DLL const Op &create_tma_im2col_descriptor();
 
 /*!
  * \brief Create a list of mbarrier with num_threads
  *
  * create_list_of_mbarrier(num_threads0, num_threads1, ...)
  *
  */
-const Op &create_list_of_mbarrier();
+TVM_DLL const Op &create_list_of_mbarrier();
 
 /*!
  * \brief Get the mbarrier with barrier_id
  *
  * int64_t* GetMBarrier(barrier_id)
  *
  */
-const Op &get_mbarrier();
+TVM_DLL const Op &get_mbarrier();
 
 /*!
  * \brief tvm intrinsics for loading data from global tensor descriptor to
@@ -98,7 +98,7 @@ const Op &get_mbarrier();
  * tma_load(descriptor, mbarrier, smem_data, coord_0, coord_1, ...)
  *
  */
-const Op &tma_load();
+TVM_DLL const Op &tma_load();
 
 /*!
  * \brief tvm intrinsics for loading image from global tensor to columns in
@@ -108,7 +108,7 @@ const Op &tma_load();
  * image_offset, ...)
  *
  */
-const Op &tma_load_im2col();
+TVM_DLL const Op &tma_load_im2col();
 
 /*!
  * \brief tvm intrinsics for storing data from shared memory to global tensor
@@ -117,119 +117,119 @@ const Op &tma_load_im2col();
  * tma_store(descriptor, smem_data, coord_0, coord_1, ...)
  *
  */
-const Op &tma_store();
+TVM_DLL const Op &tma_store();
 
 /*!
  * \brief tvm intrinsics for mbarrier wait with parity bit
  *
  * mbarrier_wait_parity(mbarrier, parity)
  *
  */
-const Op &mbarrier_wait_parity();
+TVM_DLL const Op &mbarrier_wait_parity();
 
 /*!
  * \brief tvm intrinsics for mbarrier expect tx
  *
  * mbarrier_expect_tx(mbarrier, transaction_bytes)
  *
  */
-const Op &mbarrier_expect_tx();
+TVM_DLL const Op &mbarrier_expect_tx();
 
 /*!
  * \brief tvm intrinsics for ldmatrix
  *
  * ptx_ldmatirx(transposed, num, shared_addr, local_addr)
  *
  */
-const Op &ptx_ldmatirx();
+TVM_DLL const Op &ptx_ldmatirx();
 
 /*!
  * \brief tvm intrinsics for stmatrix
  *
  * ptx_ldmatirx(transposed, num, shared_addr, int32_values...)
  *
  */
-const Op &ptx_stmatirx();
+TVM_DLL const Op &ptx_stmatrix();
 
 /*!
  * \brief Pack two b16 value into a b32 value
  *
  * int32 pack_b16(b16_value, b16_value)
  *
  */
-const Op &pack_b16();
+TVM_DLL const Op &pack_b16();
 
 /*!
  * \brief Similar to __syncthreads(), but can be used to sync partial threads
  *
  * sync_thread_partial(num_partial_threads or mbarrier)
  *
  */
-const Op &sync_thread_partial();
+TVM_DLL const Op &sync_thread_partial();
 
 /*!
  * \brief Issue a shared memory fence for async operations
  *
  * FenceProxyAsync()
 *
  */
-const Op &fence_proxy_async();
+TVM_DLL const Op &fence_proxy_async();
 
 /*!
  * \brief Indicate arrival of warp issuing TMA_STORE
  *
  * tma_store_arrive()
  *
  */
-const Op &tma_store_arrive();
+TVM_DLL const Op &tma_store_arrive();
 
 /*!
  * \brief Wait for TMA_STORE to finish
  *
  * tma_store_wait()
  *
  */
-const Op &tma_store_wait();
+TVM_DLL const Op &tma_store_wait();
 
 /*!
  * \brief Set reg hint for warp-specialized branched
  *
  * SetMaxNRegInc(num_reg, is_inc)
  *
  */
-const Op &set_max_nreg();
+TVM_DLL const Op &set_max_nreg();
 
 /*!
  * \brief No set reg hint for warp-specialized branched
  *
  * no_set_max_nreg()
  *
  */
-const Op &no_set_max_nreg();
+TVM_DLL const Op &no_set_max_nreg();
 
 /*!
  * \brief Wait the previous wgmma to finish
  *
  * wait_wgmma(num_mma)
  *
  */
-const Op &wait_wgmma();
+TVM_DLL const Op &wait_wgmma();
 
 /*!
  * \brief Synchronize all threads in a grid
  *
  * sync_grid()
  *
  */
-const Op &sync_grid();
+TVM_DLL const Op &sync_grid();
 
 /*!
  * \brief tvm intrinsic for loop continue
  *
  * loop_break()
  *
  */
-const Op &loop_break();
+TVM_DLL const Op &loop_break();
 
 /*!
  * \brief tvm intrinsic for amd matrix core mfma instructions.
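Note: the only change in this header is prefixing each accessor with `TVM_DLL`, TVM's symbol export/visibility macro. A simplified sketch of how such a macro is commonly defined follows; TVM's actual definition lives in its runtime headers and may differ in detail.

```cpp
// Simplified sketch of a TVM_DLL-style export macro (not TVM's verbatim definition).
#ifndef TVM_DLL
#ifdef _WIN32
#ifdef TVM_EXPORTS
#define TVM_DLL __declspec(dllexport)  // building the library: export the symbol
#else
#define TVM_DLL __declspec(dllimport)  // consuming the library: import the symbol
#endif
#else
#define TVM_DLL __attribute__((visibility("default")))  // keep the symbol visible
#endif
#endif
```

The practical effect is that these `const Op &...()` accessors stay linkable from outside the shared library even when the build otherwise hides symbols by default.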

src/op/elem.cc

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ Stmt Copy::LowerLDSMCopy(const LowerArgs &T, arith::Analyzer *analyzer) const {
     num = 2;
 
   Array<PrimExpr> args;
-  const Op &op = is_ldmatrix ? tl::ptx_ldmatirx() : tl::ptx_stmatirx();
+  const Op &op = is_ldmatrix ? tl::ptx_ldmatirx() : tl::ptx_stmatrix();
   args.push_back(static_cast<int>(is_transposed));
   args.push_back(num);
 
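Note: the hunk above only selects the op and begins collecting its arguments. A hedged C++ sketch of how an op plus an argument list typically becomes a TIR statement in TVM-based lowering code follows; the function name is illustrative and not taken from `elem.cc`.

```cpp
// Hedged sketch: wrap an intrinsic call in Evaluate so it appears as a
// statement in the lowered body. Illustrative only, not verbatim from elem.cc.
#include <tvm/ir/op.h>
#include <tvm/tir/expr.h>
#include <tvm/tir/stmt.h>

using namespace tvm;
using namespace tvm::tir;

Stmt MakeMatrixCopyCall(const Op &op, Array<PrimExpr> args) {
  // ptx_ldmatirx / ptx_stmatrix are registered as opaque ops with a
  // variable-length argument list, so the call carries the collected
  // arguments through to code generation unchanged.
  return Evaluate(Call(DataType::Handle(), op, args));
}
```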
