diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index c7be92298..c12cc5082 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -59,7 +59,7 @@ jobs:
             const { owner, repo } = context.repo
             const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username })
             core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none
-
+
       - name: Reject if not allowed
         if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }}
         run: |
@@ -241,10 +241,10 @@ jobs:
            const fs = require('fs');
            // Read the file directly instead of passing via env/outputs to avoid escaping issues
            const md = fs.readFileSync('regression_result.md', 'utf8');
-
+
            const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`;
-
-           const body = 
+
+           const body =
              'Performance Benchmark Report\n' +
              '============================\n\n' +
              `Triggered by: @${context.payload.comment.user.login}\n` +
@@ -262,4 +262,3 @@ jobs:
              issue_number: context.issue.number,
              body
            });
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9300c3049..f52f91b53 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,15 +13,13 @@ repos:
     hooks:
       - id: check-symlinks
       - id: destroyed-symlinks
-      # FIXME: enable these hooks
-      # - id: trailing-whitespace
-      # - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
      - id: check-added-large-files
      - id: check-merge-conflict
        fail_fast: true
-      # FIXME: enable these hooks
-      # - id: check-executables-have-shebangs
-      # - id: check-shebang-scripts-are-executable
+      - id: check-executables-have-shebangs
+      - id: check-shebang-scripts-are-executable
      - id: detect-private-key
      - id: check-yaml
      - id: check-toml
diff --git a/.pymarkdown b/.pymarkdown
index ba233849a..5394265ed 100644
--- a/.pymarkdown
+++ b/.pymarkdown
@@ -4,7 +4,7 @@
     "style": "atx"
   },
   "md004": {
-    "style": "dash" 
+    "style": "dash"
   },
   "md013": {
     "enabled": false
diff --git a/LICENSE b/LICENSE
index 2122252e9..09dd51c8c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,7 @@
 MIT License
 
 Copyright (c) Tile-AI.
- **During the period from December 1, 2024, to Mar 14, 2025, this project is 
+ **During the period from December 1, 2024, to Mar 14, 2025, this project is
 subject to additional collaboration terms with Microsoft Corporation.**
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/README.md b/README.md
index 779e2b90b..eeef6d401 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ TileLang achieves exceptional performance across a variety of computational patt
 [img: mla decode performance bs128 on H100]
-
+
 - Flash Attention Performance on H100
 [img: operator performance on H100]
@@ -170,7 +170,7 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.fl
         # Perform a tile-level GEMM on the shared buffers
         # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs
         T.gemm(A_shared, B_shared, C_local)
-
+
         # relu
         for i, j in T.Parallel(block_M, block_N):
             C_local[i, j] = T.max(C_local[i, j], 0)
diff --git a/THIRDPARTYNOTICES.txt b/THIRDPARTYNOTICES.txt
index b7c481841..3558662a8 100644
--- a/THIRDPARTYNOTICES.txt
+++ b/THIRDPARTYNOTICES.txt
@@ -1,5 +1,5 @@
-BitBLAS uses third-party material as listed below. The attached notices are 
-provided for informational purposes only. 
+BitBLAS uses third-party material as listed below. The attached notices are
+provided for informational purposes only.
 
 Notice for apache/tvm
 -------------------------------
diff --git a/docker/Dockerfile.cu118 b/docker/Dockerfile.cu118
index be8274461..969b0e43c 100644
--- a/docker/Dockerfile.cu118
+++ b/docker/Dockerfile.cu118
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3 
+FROM nvcr.io/nvidia/pytorch:22.12-py3
 
 WORKDIR /root
 
diff --git a/docker/Dockerfile.cu120 b/docker/Dockerfile.cu120
index 7ca1d931f..341fe40c0 100644
--- a/docker/Dockerfile.cu120
+++ b/docker/Dockerfile.cu120
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:23.01-py3 
+FROM nvcr.io/nvidia/pytorch:23.01-py3
 
 WORKDIR /root
 
diff --git a/docs/.gitignore b/docs/.gitignore
index 4d8eb4049..79ba97163 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,2 +1,2 @@
 _build/
-autoapi/
\ No newline at end of file
+autoapi/
diff --git a/docs/CNAME b/docs/CNAME
index ca903c694..6862cd2e9 100644
--- a/docs/CNAME
+++ b/docs/CNAME
@@ -1 +1 @@
-tilelang.com
\ No newline at end of file
+tilelang.com
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index 0ef6b48cb..a1fee9c3d 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -8,4 +8,3 @@
 
 .sidebar-logo-container {
   line-height: 0;
 }
-
diff --git a/docs/_static/img/logo-row.svg b/docs/_static/img/logo-row.svg
index 633243f3a..e73244b74 100644
--- a/docs/_static/img/logo-row.svg
+++ b/docs/_static/img/logo-row.svg
@@ -1 +1 @@
-[one-line SVG logo markup]
\ No newline at end of file
+[one-line SVG logo markup]
diff --git a/docs/deeplearning_operators/deepseek_mla.md b/docs/deeplearning_operators/deepseek_mla.md
index 2cc167c58..ed02b58b1 100644
--- a/docs/deeplearning_operators/deepseek_mla.md
+++ b/docs/deeplearning_operators/deepseek_mla.md
@@ -38,7 +38,7 @@ Notably, **TileLang accomplishes this with just around 80 lines of Python code**
 
 First, let's review the core computation logic of traditional FlashAttention:
 
-```python 
+```python
 # acc_s: [block_M, block_N]
 # scores_max: [block_M]
 # scores_scale: [block_M]
@@ -160,7 +160,7 @@ Key implementation differences between Hopper and MI300X architectures include:
 # Original shared memory allocation
 Q_shared = T.alloc_shared([block_H, dim], dtype)
 Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype)
-
+
 # Optimized register allocation
 Q_local = T.alloc_fragment([block_H, dim], dtype)
 Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype)
diff --git a/docs/deeplearning_operators/elementwise.md b/docs/deeplearning_operators/elementwise.md
index d14c516d5..6aa8e4085 100644
--- a/docs/deeplearning_operators/elementwise.md
+++ b/docs/deeplearning_operators/elementwise.md
@@ -8,7 +8,7 @@
 :class: myclass1 myclass2
 :name: a-tip-reference
 
-   This document is still **experimental** and may be incomplete. 
+   This document is still **experimental** and may be incomplete.
    Suggestions and improvements are highly encouraged—please submit a PR!
 :::
diff --git a/docs/deeplearning_operators/gemv.md b/docs/deeplearning_operators/gemv.md
index c1f937ab4..38287f220 100644
--- a/docs/deeplearning_operators/gemv.md
+++ b/docs/deeplearning_operators/gemv.md
@@ -6,7 +6,7 @@
 
 :::{warning}
-   This document is still **experimental** and may be incomplete. 
+   This document is still **experimental** and may be incomplete.
    Suggestions and improvements are highly encouraged—please submit a PR!
 :::
diff --git a/docs/deeplearning_operators/matmul_sparse.md b/docs/deeplearning_operators/matmul_sparse.md
index ffab9c112..8caa6182f 100644
--- a/docs/deeplearning_operators/matmul_sparse.md
+++ b/docs/deeplearning_operators/matmul_sparse.md
@@ -5,7 +5,7 @@
 
 :::{warning}
-   This document is still **experimental** and may be incomplete. 
+   This document is still **experimental** and may be incomplete.
 
    This feature is still **experimental** and need further optimization.
 
@@ -40,7 +40,7 @@ Both `PyTorch` and `vLLM` use `CUTLASS` as their computation backend (see refere
 A set of **CUTLASS-compatible** compressors is provided in `tilelang.utils.sparse`, where a dense tensor—along with other required arguments (e.g., block_K for sm90, transpose options)—can be passed in to perform the compression.
 
-```python 
+```python
 from tilelang.utils.sparse import compress
 A_sparse, E = compress(A, transposed=trans_A, block_k=block_K)
 ```
diff --git a/docs/get_started/overview.md b/docs/get_started/overview.md
index f4e1b5770..a7c154f31 100644
--- a/docs/get_started/overview.md
+++ b/docs/get_started/overview.md
@@ -15,38 +15,38 @@ Figure 1: High-level overview of the TileLang compilation flow.
 ## Programming Interfaces
 
 1. **Beginner Level (Hardware-Unaware)**
-   - Intended for users who need to write code that is independent of specific hardware details. 
-   - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations. 
+   - Intended for users who need to write code that is independent of specific hardware details.
+   - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations.
    - *Note:* This interface is not yet fully implemented.
 
 2. **Developer Level (Hardware-Aware with Tile Library)**
-   - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations. 
-   - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures. 
+   - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations.
+   - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures.
    - Users at this level can leverage these ready-made primitives without diving into low-level threading details.
 
 3. **Expert Level (Hardware-Aware with Thread Primitives)**
-   - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing). 
-   - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels. 
+   - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing).
+   - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels.
    - This level grants maximum flexibility for specialized optimizations tailored to specific GPU or multi-core architectures.
 
 ## Compilation Flow
 
-1. **Tile Program** 
+1. **Tile Program**
    A high-level specification of the computation.
   Depending on the user’s expertise, they may write a purely hardware-unaware tile program or incorporate constructs from the Tile Library or thread primitives.
 
-2. **Tile Program with Tile Library** 
+2. **Tile Program with Tile Library**
    When developers choose from the Tile Library, the original Tile Program is expanded with specialized library calls. These calls encapsulate efficient implementation patterns for different operations.
 
-3. **Tile Program with Thread Primitives** 
+3. **Tile Program with Thread Primitives**
    Expert-level developers can explicitly use low-level threading constructs to hand-optimize data layout, synchronization, and memory usage.
 
-4. **IRModule** 
+4. **IRModule**
    After the program is composed with libraries or thread primitives, it is lowered to an intermediate representation (IR) that captures the necessary hardware details.
 
-5. **Source Code Generation (C/CUDA/HIP/LLVM/…)** 
+5. **Source Code Generation (C/CUDA/HIP/LLVM/…)**
    From the IR, the system generates target-specific source code. This source code is tuned for the desired backends or GPU architectures (e.g., NVIDIA, AMD).
 
-6. **Hardware-Specific Executable/Runtime** 
+6. **Hardware-Specific Executable/Runtime**
    Finally, the generated source is compiled into hardware-specific executables, ready to run on the corresponding devices. The pipeline supports multiple GPU backends and can be extended to additional architectures.
 
 ## Tile-based Programming Model
diff --git a/examples/bitnet-1.58b/.gitignore b/examples/bitnet-1.58b/.gitignore
index 6ea887496..2bcdfd92b 100644
--- a/examples/bitnet-1.58b/.gitignore
+++ b/examples/bitnet-1.58b/.gitignore
@@ -1 +1 @@
-models/
\ No newline at end of file
+models/
diff --git a/examples/bitnet-1.58b/benchmark.sh b/examples/bitnet-1.58b/benchmark.sh
index 6a2550d45..839443dc6 100755
--- a/examples/bitnet-1.58b/benchmark.sh
+++ b/examples/bitnet-1.58b/benchmark.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 python benchmark_generate.py --bs 16 --in_seq_len 32 --out_seq_len 128 | tee b16_i32_o128.log
 
 python benchmark_generate.py --bs 1 --in_seq_len 512 --out_seq_len 64 | tee b1_i512_o64.log
diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh
index 741c3a124..b0430588a 100755
--- a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh
+++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # retrieve the native model input and saved model directory
 MODEL_DIR=$1
 SAVED_MODEL_DIR=$2
diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh
index a2df0eb8c..66356d3d8 100755
--- a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh
+++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # require git lfs
 if ! command -v git-lfs &> /dev/null; then
     echo "Please install git-lfs first by running 'sudo apt install git-lfs'"
diff --git a/examples/bitnet-1.58b/maint/quantize_config.json b/examples/bitnet-1.58b/maint/quantize_config.json
index e2b24123a..80fbf02f0 100644
--- a/examples/bitnet-1.58b/maint/quantize_config.json
+++ b/examples/bitnet-1.58b/maint/quantize_config.json
@@ -7,4 +7,4 @@
     "model_name_or_path": "1bitLLM/bitnet_b1_58-3B",
     "quant_method": "bitnet",
     "checkpoint_format": "bitnet"
-}
\ No newline at end of file
+}
diff --git a/examples/bitnet-1.58b/maint/upload_models.sh b/examples/bitnet-1.58b/maint/upload_models.sh
index b764b0da6..7c6d76e32 100755
--- a/examples/bitnet-1.58b/maint/upload_models.sh
+++ b/examples/bitnet-1.58b/maint/upload_models.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 MODEL_DIR=$1
 REMOTE_DIR=$2
diff --git a/examples/bitnet-1.58b/nvidia_measure_memory.sh b/examples/bitnet-1.58b/nvidia_measure_memory.sh
index e8998f309..82cf4855f 100755
--- a/examples/bitnet-1.58b/nvidia_measure_memory.sh
+++ b/examples/bitnet-1.58b/nvidia_measure_memory.sh
@@ -1 +1,3 @@
+#!/usr/bin/env bash
+
 nvidia-smi --query-gpu=memory.used --format=csv -lms 500
diff --git a/examples/deepseek_mla/README.md b/examples/deepseek_mla/README.md
index de4addcc5..bd3539d26 100644
--- a/examples/deepseek_mla/README.md
+++ b/examples/deepseek_mla/README.md
@@ -31,7 +31,7 @@ Notably, **TileLang accomplishes this with just around 80 lines of Python code**
 
 First, let's review the core computation logic of traditional FlashAttention:
 
-```python 
+```python
 # acc_s: [block_M, block_N]
 # scores_max: [block_M]
 # scores_scale: [block_M]
diff --git a/examples/deepseek_nsa/requirements.txt b/examples/deepseek_nsa/requirements.txt
index 777c2ad4c..e096dfd7d 100644
--- a/examples/deepseek_nsa/requirements.txt
+++ b/examples/deepseek_nsa/requirements.txt
@@ -1 +1 @@
-git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e
\ No newline at end of file
+git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e
diff --git a/examples/deepseek_v32/README.md b/examples/deepseek_v32/README.md
index 66596a619..01a14b6b2 100644
--- a/examples/deepseek_v32/README.md
+++ b/examples/deepseek_v32/README.md
@@ -193,10 +193,10 @@ for i_i in T.Pipelined(NI, num_stages=num_stages):
     # Load KV data for selected indices
     for bi_i, d_i in T.Parallel(BI, D):
         KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BI + bi_i], bz, d_i]
-
+
     # Recompute attention scores for backward
     T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)
-
+
     # Apply softmax gradient: dP = P * (dP_raw - Delta)
     for h_i, bi_i in T.Parallel(padded_H, BI):
         acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale
@@ -212,7 +212,7 @@ The key gradient computations are:
 
 ```python
 # Atomically update dKV at selected indices
 for bi_i, d_i in T.Parallel(BI // split_store, D // 4):
-    T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4], 
+    T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4],
                    acc_dkv_shared[bi_i, d_i * 4])
 ```
diff --git a/examples/deepseek_v32/inference/config_671B_v3.2.json b/examples/deepseek_v32/inference/config_671B_v3.2.json
index be88f1cca..375aa9aa2 100644
--- a/examples/deepseek_v32/inference/config_671B_v3.2.json
+++ b/examples/deepseek_v32/inference/config_671B_v3.2.json
@@ -23,4 +23,4 @@
"index_n_heads": 64, "index_head_dim": 128, "index_topk": 2048 -} \ No newline at end of file +} diff --git a/examples/deepseek_v32/inference/convert.py b/examples/deepseek_v32/inference/convert.py index df7943918..090be7145 100644 --- a/examples/deepseek_v32/inference/convert.py +++ b/examples/deepseek_v32/inference/convert.py @@ -42,7 +42,7 @@ def main(hf_ckpt_path, save_path, n_experts, mp): save_path (str): Path to the directory where the converted checkpoint files will be saved. n_experts (int): Total number of experts in the model. mp (int): Model parallelism factor. - + Returns: None """ diff --git a/examples/deepseek_v32/inference/requirements.txt b/examples/deepseek_v32/inference/requirements.txt index 604fed552..8c208a8b1 100644 --- a/examples/deepseek_v32/inference/requirements.txt +++ b/examples/deepseek_v32/inference/requirements.txt @@ -2,4 +2,4 @@ torch transformers safetensors fast_hadamard_transform -tilelang==0.1.6 \ No newline at end of file +tilelang==0.1.6 diff --git a/examples/dequantize_gemm/README.md b/examples/dequantize_gemm/README.md index 0c6116775..25ef617a2 100644 --- a/examples/dequantize_gemm/README.md +++ b/examples/dequantize_gemm/README.md @@ -19,7 +19,7 @@ def dequant_matmul( T.clear(Ct_local) for k in T.Pipelined( - T.ceildiv(K, block_K), + T.ceildiv(K, block_K), num_stages=num_stages ): T.copy(A[by * block_M, k * block_K], A_shared) diff --git a/examples/lazy_jit/lazyjit.en.ipynb b/examples/lazy_jit/lazyjit.en.ipynb index 99cb977f0..197f31ce4 100644 --- a/examples/lazy_jit/lazyjit.en.ipynb +++ b/examples/lazy_jit/lazyjit.en.ipynb @@ -786,4 +786,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/examples/lazy_jit/lazyjit.zh.ipynb b/examples/lazy_jit/lazyjit.zh.ipynb index 601c5c5d2..132790bb3 100644 --- a/examples/lazy_jit/lazyjit.zh.ipynb +++ b/examples/lazy_jit/lazyjit.zh.ipynb @@ -786,4 +786,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/images/MatmulExample.svg b/images/MatmulExample.svg index 6e20daf55..294e8f631 100644 --- a/images/MatmulExample.svg +++ b/images/MatmulExample.svg @@ -1 +1 @@ -A_shared=T.alloc_shared((block_M,block_K))B_shared=T.alloc_shared((block_K,block_N))C_local=T.alloc_fragment((block_M,block_N),accum_dtype)importtilelang.languageasTdefMatmul(A:T.Buffer,B:T.Buffer,C:T.Buffer):withT.Kernel(ceildiv(N,block_N),ceildiv(M,block_M),threads=128)as(bx,by):T.clear(C_local)forkinT.Pipelined(ceildiv(K,block_K),num_stages=3):T.copy(A[by*block_M,k*block_K],A_shared)T.copy(B[k*block_K,bx*block_N],B_shared)T.gemm(A_shared,B_shared,C_local)Kernel Context InitializationBuffer AllocationRegisterInitialize Accumulate Buffer with ZeroMain Loop with Pipeline AnnotationT.copy(C_local,C[by*block_M,bx*block_N])Write Back to Global MemoryCopy Data from Global to Shared MemoryGEMMSharedMemoryGlobal MemoryShared MemoryRegister Files(a) Efficient GEMM with Multi-Level Tiling on GPUs(b) Describing Tiled GPU GEMM with TileLang \ No newline at end of file +A_shared=T.alloc_shared((block_M,block_K))B_shared=T.alloc_shared((block_K,block_N))C_local=T.alloc_fragment((block_M,block_N),accum_dtype)importtilelang.languageasTdefMatmul(A:T.Buffer,B:T.Buffer,C:T.Buffer):withT.Kernel(ceildiv(N,block_N),ceildiv(M,block_M),threads=128)as(bx,by):T.clear(C_local)forkinT.Pipelined(ceildiv(K,block_K),num_stages=3):T.copy(A[by*block_M,k*block_K],A_shared)T.copy(B[k*block_K,bx*block_N],B_shared)T.gemm(A_shared,B_shared,C_local)Kernel Context InitializationBuffer AllocationRegisterInitialize 
Accumulate Buffer with ZeroMain Loop with Pipeline AnnotationT.copy(C_local,C[by*block_M,bx*block_N])Write Back to Global MemoryCopy Data from Global to Shared MemoryGEMMSharedMemoryGlobal MemoryShared MemoryRegister Files(a) Efficient GEMM with Multi-Level Tiling on GPUs(b) Describing Tiled GPU GEMM with TileLang diff --git a/images/logo-row.svg b/images/logo-row.svg index 633243f3a..e73244b74 100644 --- a/images/logo-row.svg +++ b/images/logo-row.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/maint/precision/compare_ops.py b/maint/precision/compare_ops.py old mode 100644 new mode 100755 diff --git a/maint/precision/cuda_ops.cu b/maint/precision/cuda_ops.cu index 519335751..1f37d53de 100644 --- a/maint/precision/cuda_ops.cu +++ b/maint/precision/cuda_ops.cu @@ -239,4 +239,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::arg("x"), py::arg("y") = c10::nullopt, py::arg("result"), py::arg("op_type")); m.def("launch_fast_operator", &launch_fast_operator, "CUDA Fast Operator", py::arg("x"), py::arg("y") = c10::nullopt, py::arg("result"), py::arg("op_type")); -} \ No newline at end of file +} diff --git a/maint/scripts/apply_mit_license.sh b/maint/scripts/apply_mit_license.sh index cc425b964..2bb7cc946 100755 --- a/maint/scripts/apply_mit_license.sh +++ b/maint/scripts/apply_mit_license.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + echo "Add MIT license boilerplate..." PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # TO source code root @@ -17,8 +19,8 @@ done for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name \ '*apply_mit_liscense.sh' -not -name '*check_mit_liscense.sh' -and \( -name 'CMakeLists.txt' -or -name '*.cmake' \ -or -name '*.py' -or -name '*.dockerfile' -or -name '*.yaml' \) ); do - sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE} - if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then + sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE} + if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then cat maint/scripts/mit_liscense2.txt ${SRC_FILE} > ${SRC_FILE}.new mv ${SRC_FILE}.new ${SRC_FILE} fi @@ -26,7 +28,7 @@ done for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name \ '*apply_mit_liscense.sh' -not -name '*check_mit_liscense.sh' -name '*.sh' ); do - sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE} + sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE} if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then line=$(head -n 1 ${SRC_FILE}) if [[ $line == "#!/bin/bash"* ]]; then diff --git a/maint/scripts/build_docs.sh b/maint/scripts/build_docs.sh index f367dcc70..3119eb8c7 100755 --- a/maint/scripts/build_docs.sh +++ b/maint/scripts/build_docs.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + python -m venv .venv source .venv/bin/activate python -m pip install --upgrade pip --no-user diff --git a/maint/scripts/check_mit_license.sh b/maint/scripts/check_mit_license.sh index 855c48f4c..3802b1efa 100755 --- a/maint/scripts/check_mit_license.sh +++ b/maint/scripts/check_mit_license.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + echo "Check MIT License boilerplate..." PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # To source code root @@ -8,7 +10,7 @@ EXITCODE=0 for SRC_FILE in $(find . 
-path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name '*apply_mit_license.sh' \ -not -name '*check_mit_license.sh' -and \( -name 'CMakeLists.txt' -or -name '*.cpp' -or -name '*.cu' -or -name '*.h' -or -name '*.hpp' \ -or -name '*.py' -or -name '*.sh' -or -name '*.dockerfile' -or -name '*.yaml' \) ); do - + # Skip files that already contain the Apache License if grep -q "Apache License" "${SRC_FILE}"; then continue diff --git a/maint/scripts/local_distribution.sh b/maint/scripts/local_distribution.sh index ff8239dff..d3b137fb4 100755 --- a/maint/scripts/local_distribution.sh +++ b/maint/scripts/local_distribution.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + set -eux rm -rf dist diff --git a/maint/scripts/pypi_distribution.sh b/maint/scripts/pypi_distribution.sh index 5a0865141..9a8c6e62c 100755 --- a/maint/scripts/pypi_distribution.sh +++ b/maint/scripts/pypi_distribution.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + set -eux rm -rf dist raw_dist diff --git a/pyproject.toml b/pyproject.toml index 32c0c6db1..7c9849e3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,7 +127,7 @@ tilelang = "tilelang" "tilelang/3rdparty/composable_kernel/include" = "3rdparty/composable_kernel/include" "tilelang/3rdparty/composable_kernel/library" = "3rdparty/composable_kernel/library" - + [tool.codespell] ignore-words = "docs/spelling_wordlist.txt" diff --git a/requirements-test.txt b/requirements-test.txt index a46e87a1c..533cab567 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -19,7 +19,9 @@ docutils dtlib einops flash-linear-attention==0.3.2 +matplotlib packaging>=21.0 +pandas pytest-durations pytest-timeout pytest-xdist>=2.2.1 @@ -27,10 +29,8 @@ pytest>=6.2.4 pyyaml requests scipy +seaborn tabulate tornado wheel z3-solver>=4.13.0 -matplotlib -seaborn -pandas diff --git a/requirements.txt b/requirements.txt index 5efd7c7f9..6735d178b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ torch torch>=2.7; platform_system == 'Darwin' tqdm>=4.62.3 typing-extensions>=4.10.0 -z3-solver>=4.13.0 \ No newline at end of file +z3-solver>=4.13.0 diff --git a/src/op/gemm_sp_py.h b/src/op/gemm_sp_py.h index 2f79c5e15..b23b9fc5c 100644 --- a/src/op/gemm_sp_py.h +++ b/src/op/gemm_sp_py.h @@ -91,4 +91,4 @@ class GemmSPPy : public TileOperator { } // namespace tl } // namespace tvm -#endif // TVM_TL_OP_GEMM_SP_PY_H_ \ No newline at end of file +#endif // TVM_TL_OP_GEMM_SP_PY_H_ diff --git a/src/tl_templates/cpp/common.h b/src/tl_templates/cpp/common.h index 0ce6580d3..f1fe801e6 100644 --- a/src/tl_templates/cpp/common.h +++ b/src/tl_templates/cpp/common.h @@ -5,4 +5,4 @@ #include using half_float::half; -// Not Implemented \ No newline at end of file +// Not Implemented diff --git a/src/tl_templates/cpu/common.h b/src/tl_templates/cpu/common.h index b288cd114..b69b23186 100644 --- a/src/tl_templates/cpu/common.h +++ b/src/tl_templates/cpu/common.h @@ -4,4 +4,4 @@ #include // Not Implemented -F \ No newline at end of file +F diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h index 3f8ce5e6b..40d364bc9 100644 --- a/src/tl_templates/cuda/debug.h +++ b/src/tl_templates/cuda/debug.h @@ -125,4 +125,4 @@ TL_DEVICE void device_assert_with_msg(bool cond, const char *msg) { printf("Device assert failed: %s\n", msg); assert(0); } -} \ No newline at end of file +} diff --git a/src/tl_templates/cuda/gemm_sp_sm90.h b/src/tl_templates/cuda/gemm_sp_sm90.h index 6184f9be7..522fc11ee 100644 --- a/src/tl_templates/cuda/gemm_sp_sm90.h +++ 
b/src/tl_templates/cuda/gemm_sp_sm90.h @@ -231,4 +231,4 @@ TL_DEVICE void gemm_sp_ss(A_type *pA, B_type *pB, C_type *accum, E_type *pE) { CUTE_GCC_UNREACHABLE; } } -} // namespace tl \ No newline at end of file +} // namespace tl diff --git a/src/tl_templates/cuda/ldsm.h b/src/tl_templates/cuda/ldsm.h index 4d6af8a09..a20746dff 100644 --- a/src/tl_templates/cuda/ldsm.h +++ b/src/tl_templates/cuda/ldsm.h @@ -118,4 +118,4 @@ TL_DEVICE void ptx_stmatrix_x4_trans(void const *const smem_ptr, "r"(value0), "r"(value1), "r"(value2), "r"(value3)); } -} // namespace tl \ No newline at end of file +} // namespace tl diff --git a/src/tl_templates/hip/hip_fp8.h b/src/tl_templates/hip/hip_fp8.h index b32f84dca..82fb53031 100644 --- a/src/tl_templates/hip/hip_fp8.h +++ b/src/tl_templates/hip/hip_fp8.h @@ -164,4 +164,4 @@ __device__ fp8_e4_16_t make_fp8_e4_16_t(fp8_e4_t x0, fp8_e4_t x1, fp8_e4_t x2, res.x = res_x; res.y = res_y; return res; -} \ No newline at end of file +} diff --git a/src/tl_templates/hip/ldsm.h b/src/tl_templates/hip/ldsm.h index 68c1455f7..286b77324 100644 --- a/src/tl_templates/hip/ldsm.h +++ b/src/tl_templates/hip/ldsm.h @@ -1,3 +1,3 @@ #pragma once -#include "common.h" \ No newline at end of file +#include "common.h" diff --git a/src/transform/arg_binder.h b/src/transform/arg_binder.h index bb7a0f46f..0d7c3ee62 100644 --- a/src/transform/arg_binder.h +++ b/src/transform/arg_binder.h @@ -182,4 +182,4 @@ class ArgBinder { }; } // namespace tl } // namespace tvm -#endif // TVM_TL_TRANSFORM_ARG_BINDER_H_ \ No newline at end of file +#endif // TVM_TL_TRANSFORM_ARG_BINDER_H_ diff --git a/src/transform/atomicadd_vectorize.h b/src/transform/atomicadd_vectorize.h index 627dc895f..6bd3309ae 100644 --- a/src/transform/atomicadd_vectorize.h +++ b/src/transform/atomicadd_vectorize.h @@ -57,4 +57,4 @@ class AtomicAddVectorizePlanner : public arith::IRVisitorWithAnalyzer { } // namespace tl } // namespace tvm -#endif // TVM_TL_ATOMICADD_VECTORIZE_H_ \ No newline at end of file +#endif // TVM_TL_ATOMICADD_VECTORIZE_H_ diff --git a/src/transform/common/assume.h b/src/transform/common/assume.h index c6eadc6b3..db830818e 100644 --- a/src/transform/common/assume.h +++ b/src/transform/common/assume.h @@ -25,4 +25,4 @@ bool IsAssumeInEvaluateForm(const Stmt &stmt); } // namespace tl } // namespace tvm -#endif // TVM_TL_TRANSFORM_COMMON_ASSUME_H_ \ No newline at end of file +#endif // TVM_TL_TRANSFORM_COMMON_ASSUME_H_ diff --git a/src/transform/common/loop_vectorization_utils.h b/src/transform/common/loop_vectorization_utils.h index b9b7715d0..c23252f41 100644 --- a/src/transform/common/loop_vectorization_utils.h +++ b/src/transform/common/loop_vectorization_utils.h @@ -781,4 +781,4 @@ class Vectorizer : public StmtMutator, }; } // namespace tl -} // namespace tvm \ No newline at end of file +} // namespace tvm diff --git a/src/transform/make_packed_api.cc b/src/transform/make_packed_api.cc index e9e8f76e6..0fd3a2c16 100644 --- a/src/transform/make_packed_api.cc +++ b/src/transform/make_packed_api.cc @@ -619,4 +619,4 @@ TVM_FFI_STATIC_INIT_BLOCK() { } } // namespace tl -} // namespace tvm \ No newline at end of file +} // namespace tvm diff --git a/src/transform/split_host_device.cc b/src/transform/split_host_device.cc index bfdcb5cd5..57d4b8127 100644 --- a/src/transform/split_host_device.cc +++ b/src/transform/split_host_device.cc @@ -268,4 +268,4 @@ TVM_FFI_STATIC_INIT_BLOCK() { } // namespace transform } // namespace tl -} // namespace tvm \ No newline at end of file +} // namespace tvm diff --git 
a/tilelang/carver/README.md b/tilelang/carver/README.md index 164006134..f484f47d7 100644 --- a/tilelang/carver/README.md +++ b/tilelang/carver/README.md @@ -33,8 +33,8 @@ arch = CUDA("nvidia/geforce-rtx-4090") # for k in Reduce(1024): # ... carve_template = carver.GeneralReductionTemplate( - structure="SSR", - shape=[1024, 1024, 1024], + structure="SSR", + shape=[1024, 1024, 1024], dtype="float16", ).with_arch(arch) diff --git a/tilelang/jit/adapter/cython/cython_wrapper.pyx b/tilelang/jit/adapter/cython/cython_wrapper.pyx index dc462c627..38c1738f7 100644 --- a/tilelang/jit/adapter/cython/cython_wrapper.pyx +++ b/tilelang/jit/adapter/cython/cython_wrapper.pyx @@ -83,8 +83,8 @@ cdef class CythonKernelWrapper: tensor_device = tensor.device device_type_match = device.type == tensor_device.type device_index_match = ( - tensor_device.index is None or - device.index is None or + tensor_device.index is None or + device.index is None or tensor_device.index == device.index ) if not (device_type_match and device_index_match): @@ -116,7 +116,7 @@ cdef class CythonKernelWrapper: f"expected {len(shape_list)} dimensions, " f"got {tensor.dim()}" ) - + # Check each dimension for shape_idx, expected_shape in shape_list: actual_shape = tensor.shape[shape_idx] @@ -176,7 +176,7 @@ cdef class CythonKernelWrapper: ) # Use current CUDA stream if none specified - if stream == -1: + if stream == -1: if torch.cuda.is_available(): try: stream = torch._C._cuda_getCurrentRawStream(torch.cuda.current_device()) @@ -239,7 +239,7 @@ cdef class CythonKernelWrapper: torch.int64: ctypes.c_int64, torch.bool: ctypes.c_bool, } - + call_args = [] for i, tensor in enumerate(tensor_list): if isinstance(tensor, torch.Tensor): @@ -286,4 +286,3 @@ cdef class CythonKernelWrapper: return tensor_list[self.result_idx[0]] else: return [tensor_list[i] for i in self.result_idx] -