Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
5617be1
feat: CuTeDSL backend
lucifer1004 Dec 13, 2025
98fd67a
fix: clang-tidy
lucifer1004 Dec 13, 2025
1a94479
fix: clang-format
lucifer1004 Dec 13, 2025
cdd8b61
fix: ci
lucifer1004 Dec 13, 2025
f58ca73
fix: revert example gemm fp8
lucifer1004 Dec 14, 2025
4154b79
fix: remove duplicate code
lucifer1004 Dec 14, 2025
2e9c5e4
fix: switch-case
lucifer1004 Dec 14, 2025
c09af46
fix: fp16 silence
lucifer1004 Dec 14, 2025
375a389
fix: TVM IR print
lucifer1004 Dec 14, 2025
6e09ec5
fix: useless tir
lucifer1004 Dec 14, 2025
9e5355b
fix: clang-format
lucifer1004 Dec 14, 2025
c7319c5
fix: remove tilelang/contrib/cutedsl/.gitignore
lucifer1004 Dec 14, 2025
f3362bc
fix: use hexfloat
lucifer1004 Dec 14, 2025
7d4cef6
fix: gsym guard
lucifer1004 Dec 14, 2025
702c1b1
fix: unknown storage sync type
lucifer1004 Dec 14, 2025
1447851
fix: string literal
lucifer1004 Dec 14, 2025
35b7a2a
fix: add args guard
lucifer1004 Dec 14, 2025
4ca27bc
fix: name hint dedup
lucifer1004 Dec 15, 2025
1680ed5
fix: better find_kernel_by_pattern
lucifer1004 Dec 15, 2025
76354cb
fix: set libpath for from_database path
lucifer1004 Dec 15, 2025
081de86
fix: guard buffer.strides
lucifer1004 Dec 15, 2025
5252482
fix: from guard
lucifer1004 Dec 15, 2025
4238d9e
fix: eviction guard
lucifer1004 Dec 15, 2025
45fc89a
fix: use thread local tma descs
lucifer1004 Dec 15, 2025
c510459
fix: ruff
lucifer1004 Dec 15, 2025
a5a721e
fix: drop tma_init_cpp
lucifer1004 Dec 15, 2025
1465ff9
fix: exc_info
lucifer1004 Dec 15, 2025
51bdb1f
fix: negative unmatch early return
lucifer1004 Dec 15, 2025
e6efdd9
fix: rename postproc func and add test
lucifer1004 Dec 15, 2025
9246673
fix: handle fast math according to pass config
lucifer1004 Dec 15, 2025
25c2d02
fix: dyn_sym parse
lucifer1004 Dec 15, 2025
cf01c04
fix: wrap_forward
lucifer1004 Dec 15, 2025
b6987f5
fix: use tvm_ffi.libinfo instead of cli
lucifer1004 Dec 15, 2025
b317a03
fix: keep signature
lucifer1004 Dec 15, 2025
b4fbe8e
fix: C++ string safety
lucifer1004 Dec 15, 2025
17b0696
fix: mark tma_store_add as unsupported
lucifer1004 Dec 15, 2025
6ea6c8e
fix: tvm version
lucifer1004 Dec 15, 2025
402ca6a
resolve ldsm and cpasync issues.
cherichy Dec 15, 2025
c98a080
fix: minor fixes
lucifer1004 Dec 15, 2025
2b16bef
fix: parse signature using ast
lucifer1004 Dec 15, 2025
1a5139f
fix: guard global_addr
lucifer1004 Dec 15, 2025
b22a82c
fix: create tempfile only when necessary
lucifer1004 Dec 15, 2025
7a4bcc8
fix: use logger.execption for exceptions
lucifer1004 Dec 15, 2025
336d387
fix: guard lib_path and host_func
lucifer1004 Dec 15, 2025
f3d8029
fix: remove tma_cpp_init and add timeout for cpp compile
lucifer1004 Dec 15, 2025
9866d88
add timeout for mbarrier_wait.
cherichy Dec 15, 2025
95e5616
fix: _load_kernel_from_disk signature
lucifer1004 Dec 15, 2025
4533c34
resolve codegen issues.
cherichy Dec 15, 2025
530ca5e
fix: logger.exception
lucifer1004 Dec 15, 2025
f196e37
add comment for div_by=1
Dec 15, 2025
f1eb843
merge
lucifer1004 Dec 15, 2025
fbe1cf3
fix: reserve cutlass,cute,tl
lucifer1004 Dec 15, 2025
82a2968
fix: guard tma_store
lucifer1004 Dec 15, 2025
8a1011e
fix: allow int64 offset in make_tensor_at_offset
lucifer1004 Dec 15, 2025
b777c99
fix: guard barrier
lucifer1004 Dec 15, 2025
9021d8c
fix: add comments for div_by=16
lucifer1004 Dec 15, 2025
fd15d08
fix: div_by=1 issue
lucifer1004 Dec 15, 2025
a2f87b3
delete div_by when offset is 0
Dec 15, 2025
f996239
use tl.make_tensor when offset is 0
Dec 15, 2025
ede17db
fix: explicitly check cutedsl target
lucifer1004 Dec 17, 2025
7500841
Merge remote-tracking branch 'upstream/main' into github-cutedsl
lucifer1004 Dec 17, 2025
32fe9b4
fix: use param.torch_dtype()
lucifer1004 Dec 17, 2025
0575f11
Merge branch 'main' into github-cutedsl
lucifer1004 Dec 17, 2025
043ba95
Merge branch 'main' of https://github.com/tile-ai/tilelang into githu…
LeiWang1999 Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,27 @@ jobs:
pytest --verbose --color=yes --durations=0 --showlocals --cache-clear
)
"${PYTEST[@]}" --maxfail=3 --numprocesses=4 \
--ignore=./python/jit/test_tilelang_jit_cutedsl.py \
./python

# CuTeDSL JIT tests require GEMM v1: TILELANG_USE_GEMM_V1 must be set before tilelang is imported.
# Run them in a dedicated step so the default GEMM selection is left unchanged
# (the rest of the CUDA tests stay on GEMM v2).
- name: Run CuTeDSL JIT tests (GEMM v1) with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }})
id: cutedsl-tests
if: contains(matrix.runner.toolkit, 'CUDA')
env:
TILELANG_USE_GEMM_V1: "1"
run: |
cd testing
PYTEST=(
uv run --no-project -m --
pytest --verbose --color=yes --durations=0 --showlocals --cache-clear
)
# Avoid xdist contention on a single GPU by running this file in one worker.
"${PYTEST[@]}" --maxfail=3 --numprocesses=1 \
./python/jit/test_tilelang_jit_cutedsl.py

# AMD ROCm tests
- name: Run ROCm tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }})
id: rocm-tests
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,11 @@ elseif(USE_CUDA)
src/runtime/runtime.cc
src/target/ptx.cc
src/target/codegen_cuda.cc
src/target/codegen_py.cc
src/target/codegen_utils.cc
src/target/codegen_cutedsl.cc
src/target/rt_mod_cuda.cc
src/target/rt_mod_cutedsl.cc
)
list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS})

Expand Down
10 changes: 8 additions & 2 deletions maint/scripts/run_local_ci_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@ cd examples
python -m pytest -n 4 . --verbose --color=yes --durations=0 --showlocals --cache-clear
cd ..

# Run pytest in parallel (4 workers) for all tests in the testing/python directory
# Run pytest in parallel (4 workers) for all tests in the testing/python directory.
# IMPORTANT: CuTeDSL backend currently requires GEMM v1 (TILELANG_USE_GEMM_V1=1).
# Do NOT export it globally here, or you'll silently change the default GEMM selection
# for unrelated tests. Run the CuTeDSL JIT tests in a separate pytest invocation.
cd testing/python
python -m pytest -n 4 . --verbose --color=yes --durations=0 --showlocals --cache-clear
python -m pytest -n 4 . --ignore=jit/test_tilelang_jit_cutedsl.py --verbose --color=yes --durations=0 --showlocals --cache-clear

# CuTeDSL JIT tests (isolate env + avoid xdist contention on a single GPU)
TILELANG_USE_GEMM_V1=1 python -m pytest -n 1 jit/test_tilelang_jit_cutedsl.py --verbose --color=yes --durations=0 --showlocals --cache-clear
cd ..
2 changes: 2 additions & 0 deletions requirements-test-cuda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
# CUDA specific requirements
flash-attn==2.5.8
cuda-python==12.9.4
# CuTeDSL (CUTLASS Python DSL with CuTe support)
nvidia-cutlass-dsl>=4.3.1
Loading
Loading