diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index c7be92298..c12cc5082 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -59,7 +59,7 @@ jobs:
const { owner, repo } = context.repo
const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username })
core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none
-
+
- name: Reject if not allowed
if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }}
run: |
@@ -241,10 +241,10 @@ jobs:
const fs = require('fs');
// Read the file directly instead of passing via env/outputs to avoid escaping issues
const md = fs.readFileSync('regression_result.md', 'utf8');
-
+
const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`;
-
- const body =
+
+ const body =
'Performance Benchmark Report\n' +
'============================\n\n' +
`Triggered by: @${context.payload.comment.user.login}\n` +
@@ -262,4 +262,3 @@ jobs:
issue_number: context.issue.number,
body
});
-
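The workflow hunk above gates the benchmark bot on the commenter's collaborator permission. For reference, a minimal standalone sketch of the same gate, assuming `requests` is installed and a token is exported as `GITHUB_TOKEN`; the endpoint is the real GitHub REST permission check, and the accepted levels mirror the comment in the workflow (`admin|maintain|write` pass, `triage|read|none` fail):

```python
# Hedged sketch, not part of the workflow: reproduces the github-script
# permission gate with plain REST calls. Assumes GITHUB_TOKEN is set.
import os

import requests


def is_allowed(owner: str, repo: str, username: str) -> bool:
    """True if `username` has write access or better on owner/repo."""
    url = f"https://api.github.com/repos/{owner}/{repo}/collaborators/{username}/permission"
    resp = requests.get(
        url, headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"})
    resp.raise_for_status()
    # Same check as the workflow step: admin | maintain | write may trigger runs.
    return resp.json()["permission"] in {"admin", "maintain", "write"}
```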
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9300c3049..f52f91b53 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,15 +13,13 @@ repos:
hooks:
- id: check-symlinks
- id: destroyed-symlinks
- # FIXME: enable these hooks
- # - id: trailing-whitespace
- # - id: end-of-file-fixer
+ - id: trailing-whitespace
+ - id: end-of-file-fixer
- id: check-added-large-files
- id: check-merge-conflict
fail_fast: true
- # FIXME: enable these hooks
- # - id: check-executables-have-shebangs
- # - id: check-shebang-scripts-are-executable
+ - id: check-executables-have-shebangs
+ - id: check-shebang-scripts-are-executable
- id: detect-private-key
- id: check-yaml
- id: check-toml
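With `trailing-whitespace`, `end-of-file-fixer`, and the two shebang hooks now enabled, most of the whitespace, end-of-file, and shebang changes in the rest of this diff become automatically enforced. A minimal sketch of exercising the full hook suite locally before pushing, assuming `pre-commit` is installed (`pip install pre-commit`):

```python
# Hedged sketch: run every configured hook against the whole tree,
# mirroring what CI enforces after this change.
import subprocess

subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```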
diff --git a/.pymarkdown b/.pymarkdown
index ba233849a..5394265ed 100644
--- a/.pymarkdown
+++ b/.pymarkdown
@@ -4,7 +4,7 @@
"style": "atx"
},
"md004": {
- "style": "dash"
+ "style": "dash"
},
"md013": {
"enabled": false
diff --git a/LICENSE b/LICENSE
index 2122252e9..09dd51c8c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,7 @@
MIT License
Copyright (c) Tile-AI.
- **During the period from December 1, 2024, to Mar 14, 2025, this project is
+ **During the period from December 1, 2024, to Mar 14, 2025, this project is
subject to additional collaboration terms with Microsoft Corporation.**
Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/README.md b/README.md
index 779e2b90b..eeef6d401 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ TileLang achieves exceptional performance across a variety of computational patt
-
+
- Flash Attention Performance on H100

@@ -170,7 +170,7 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.fl
# Perform a tile-level GEMM on the shared buffers
# Currently we dispatch to the cute/hip backends on NVIDIA/AMD GPUs
T.gemm(A_shared, B_shared, C_local)
-
+
# relu
for i, j in T.Parallel(block_M, block_N):
C_local[i, j] = T.max(C_local[i, j], 0)
diff --git a/THIRDPARTYNOTICES.txt b/THIRDPARTYNOTICES.txt
index b7c481841..3558662a8 100644
--- a/THIRDPARTYNOTICES.txt
+++ b/THIRDPARTYNOTICES.txt
@@ -1,5 +1,5 @@
-BitBLAS uses third-party material as listed below. The attached notices are
-provided for informational purposes only.
+BitBLAS uses third-party material as listed below. The attached notices are
+provided for informational purposes only.
Notice for apache/tvm
-------------------------------
diff --git a/docker/Dockerfile.cu118 b/docker/Dockerfile.cu118
index be8274461..969b0e43c 100644
--- a/docker/Dockerfile.cu118
+++ b/docker/Dockerfile.cu118
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+FROM nvcr.io/nvidia/pytorch:22.12-py3
WORKDIR /root
diff --git a/docker/Dockerfile.cu120 b/docker/Dockerfile.cu120
index 7ca1d931f..341fe40c0 100644
--- a/docker/Dockerfile.cu120
+++ b/docker/Dockerfile.cu120
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:23.01-py3
+FROM nvcr.io/nvidia/pytorch:23.01-py3
WORKDIR /root
diff --git a/docs/.gitignore b/docs/.gitignore
index 4d8eb4049..79ba97163 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,2 +1,2 @@
_build/
-autoapi/
\ No newline at end of file
+autoapi/
diff --git a/docs/CNAME b/docs/CNAME
index ca903c694..6862cd2e9 100644
--- a/docs/CNAME
+++ b/docs/CNAME
@@ -1 +1 @@
-tilelang.com
\ No newline at end of file
+tilelang.com
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index 0ef6b48cb..a1fee9c3d 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -8,4 +8,3 @@
.sidebar-logo-container {
line-height: 0;
}
-
diff --git a/docs/_static/img/logo-row.svg b/docs/_static/img/logo-row.svg
index 633243f3a..e73244b74 100644
--- a/docs/_static/img/logo-row.svg
+++ b/docs/_static/img/logo-row.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
diff --git a/docs/deeplearning_operators/deepseek_mla.md b/docs/deeplearning_operators/deepseek_mla.md
index 2cc167c58..ed02b58b1 100644
--- a/docs/deeplearning_operators/deepseek_mla.md
+++ b/docs/deeplearning_operators/deepseek_mla.md
@@ -38,7 +38,7 @@ Notably, **TileLang accomplishes this with just around 80 lines of Python code**
First, let's review the core computation logic of traditional FlashAttention:
-```python
+```python
# acc_s: [block_M, block_N]
# scores_max: [block_M]
# scores_scale: [block_M]
@@ -160,7 +160,7 @@ Key implementation differences between Hopper and MI300X architectures include:
# Original shared memory allocation
Q_shared = T.alloc_shared([block_H, dim], dtype)
Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype)
-
+
# Optimized register allocation
Q_local = T.alloc_fragment([block_H, dim], dtype)
Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype)
diff --git a/docs/deeplearning_operators/elementwise.md b/docs/deeplearning_operators/elementwise.md
index d14c516d5..6aa8e4085 100644
--- a/docs/deeplearning_operators/elementwise.md
+++ b/docs/deeplearning_operators/elementwise.md
@@ -8,7 +8,7 @@
:class: myclass1 myclass2
:name: a-tip-reference
- This document is still **experimental** and may be incomplete.
+ This document is still **experimental** and may be incomplete.
Suggestions and improvements are highly encouraged—please submit a PR!
:::
diff --git a/docs/deeplearning_operators/gemv.md b/docs/deeplearning_operators/gemv.md
index c1f937ab4..38287f220 100644
--- a/docs/deeplearning_operators/gemv.md
+++ b/docs/deeplearning_operators/gemv.md
@@ -6,7 +6,7 @@
:::{warning}
- This document is still **experimental** and may be incomplete.
+ This document is still **experimental** and may be incomplete.
Suggestions and improvements are highly encouraged—please submit a PR!
:::
diff --git a/docs/deeplearning_operators/matmul_sparse.md b/docs/deeplearning_operators/matmul_sparse.md
index ffab9c112..8caa6182f 100644
--- a/docs/deeplearning_operators/matmul_sparse.md
+++ b/docs/deeplearning_operators/matmul_sparse.md
@@ -5,7 +5,7 @@
:::{warning}
- This document is still **experimental** and may be incomplete.
+ This document is still **experimental** and may be incomplete.
This feature is still **experimental** and needs further optimization.
@@ -40,7 +40,7 @@ Both `PyTorch` and `vLLM` use `CUTLASS` as their computation backend (see refere
A set of **CUTLASS-compatible** compressors is provided in `tilelang.utils.sparse`, where a dense tensor—along with other required arguments (e.g., block_K for sm90, transpose options)—can be passed in to perform the compression.
-```python
+```python
from tilelang.utils.sparse import compress
A_sparse, E = compress(A, transposed=trans_A, block_k=block_K)
```
diff --git a/docs/get_started/overview.md b/docs/get_started/overview.md
index f4e1b5770..a7c154f31 100644
--- a/docs/get_started/overview.md
+++ b/docs/get_started/overview.md
@@ -15,38 +15,38 @@ Figure 1: High-level overview of the TileLang compilation flow.
## Programming Interfaces
1. **Beginner Level (Hardware-Unaware)**
- - Intended for users who need to write code that is independent of specific hardware details.
- - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations.
+ - Intended for users who need to write code that is independent of specific hardware details.
+ - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations.
- *Note:* This interface is not yet fully implemented.
2. **Developer Level (Hardware-Aware with Tile Library)**
- - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations.
- - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures.
+ - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations.
+ - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures.
- Users at this level can leverage these ready-made primitives without diving into low-level threading details.
3. **Expert Level (Hardware-Aware with Thread Primitives)**
- - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing).
- - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels.
+ - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing).
+ - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels.
- This level grants maximum flexibility for specialized optimizations tailored to specific GPU or multi-core architectures.
## Compilation Flow
-1. **Tile Program**
+1. **Tile Program**
A high-level specification of the computation. Depending on the user’s expertise, they may write a purely hardware-unaware tile program or incorporate constructs from the Tile Library or thread primitives.
-2. **Tile Program with Tile Library**
+2. **Tile Program with Tile Library**
When developers choose from the Tile Library, the original Tile Program is expanded with specialized library calls. These calls encapsulate efficient implementation patterns for different operations.
-3. **Tile Program with Thread Primitives**
+3. **Tile Program with Thread Primitives**
Expert-level developers can explicitly use low-level threading constructs to hand-optimize data layout, synchronization, and memory usage.
-4. **IRModule**
+4. **IRModule**
After the program is composed with libraries or thread primitives, it is lowered to an intermediate representation (IR) that captures the necessary hardware details.
-5. **Source Code Generation (C/CUDA/HIP/LLVM/…)**
+5. **Source Code Generation (C/CUDA/HIP/LLVM/…)**
From the IR, the system generates target-specific source code. This source code is tuned for the desired backends or GPU architectures (e.g., NVIDIA, AMD).
-6. **Hardware-Specific Executable/Runtime**
+6. **Hardware-Specific Executable/Runtime**
Finally, the generated source is compiled into hardware-specific executables, ready to run on the corresponding devices. The pipeline supports multiple GPU backends and can be extended to additional architectures.
## Tile-based Programming Model
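The overview hunk above describes the compilation flow from a "Tile Program" down to a hardware-specific executable. For orientation, here is a minimal sketch of such a tile program, stitched together from constructs that appear elsewhere in this diff (`T.alloc_shared`, `T.alloc_fragment`, `T.clear`, `T.Pipelined`, `T.ceildiv`, `T.copy`, `T.gemm`); the `tilelang.jit` / `T.prim_func` decorators and the `T.Kernel` launch helper are assumptions about the current API and may differ between TileLang versions:

```python
# Hedged sketch of a developer-level "Tile Program" (stage 1 of the flow).
import tilelang
import tilelang.language as T


@tilelang.jit  # assumed spelling of the JIT entry point
def matmul(M, N, K, block_M, block_N, block_K,
           dtype=T.float16, accum_dtype=T.float):

    @T.prim_func
    def main(A: T.Tensor((M, K), dtype),
             B: T.Tensor((K, N), dtype),
             C: T.Tensor((M, N), dtype)):
        # One program instance per output tile; thread count is illustrative.
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M),
                      threads=128) as (bx, by):
            A_shared = T.alloc_shared([block_M, block_K], dtype)
            B_shared = T.alloc_shared([block_K, block_N], dtype)
            C_local = T.alloc_fragment([block_M, block_N], accum_dtype)
            T.clear(C_local)
            # Software-pipelined K loop, as in the dequantize_gemm example.
            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                T.copy(A[by * block_M, k * block_K], A_shared)
                T.copy(B[k * block_K, bx * block_N], B_shared)
                # Tile-level GEMM on the shared buffers.
                T.gemm(A_shared, B_shared, C_local)
            T.copy(C_local, C[by * block_M, bx * block_N])

    return main
```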
diff --git a/examples/bitnet-1.58b/.gitignore b/examples/bitnet-1.58b/.gitignore
index 6ea887496..2bcdfd92b 100644
--- a/examples/bitnet-1.58b/.gitignore
+++ b/examples/bitnet-1.58b/.gitignore
@@ -1 +1 @@
-models/
\ No newline at end of file
+models/
diff --git a/examples/bitnet-1.58b/benchmark.sh b/examples/bitnet-1.58b/benchmark.sh
index 6a2550d45..839443dc6 100755
--- a/examples/bitnet-1.58b/benchmark.sh
+++ b/examples/bitnet-1.58b/benchmark.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
python benchmark_generate.py --bs 16 --in_seq_len 32 --out_seq_len 128 | tee b16_i32_o128.log
python benchmark_generate.py --bs 1 --in_seq_len 512 --out_seq_len 64 | tee b1_i512_o64.log
diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh
index 741c3a124..b0430588a 100755
--- a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh
+++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
# retrieve the native model input and saved model directory
MODEL_DIR=$1
SAVED_MODEL_DIR=$2
diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh
index a2df0eb8c..66356d3d8 100755
--- a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh
+++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
# require git lfs
if ! command -v git-lfs &> /dev/null; then
echo "Please install git-lfs first by running 'sudo apt install git-lfs'"
diff --git a/examples/bitnet-1.58b/maint/quantize_config.json b/examples/bitnet-1.58b/maint/quantize_config.json
index e2b24123a..80fbf02f0 100644
--- a/examples/bitnet-1.58b/maint/quantize_config.json
+++ b/examples/bitnet-1.58b/maint/quantize_config.json
@@ -7,4 +7,4 @@
"model_name_or_path": "1bitLLM/bitnet_b1_58-3B",
"quant_method": "bitnet",
"checkpoint_format": "bitnet"
-}
\ No newline at end of file
+}
diff --git a/examples/bitnet-1.58b/maint/upload_models.sh b/examples/bitnet-1.58b/maint/upload_models.sh
index b764b0da6..7c6d76e32 100755
--- a/examples/bitnet-1.58b/maint/upload_models.sh
+++ b/examples/bitnet-1.58b/maint/upload_models.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
MODEL_DIR=$1
REMOTE_DIR=$2
diff --git a/examples/bitnet-1.58b/nvidia_measure_memory.sh b/examples/bitnet-1.58b/nvidia_measure_memory.sh
index e8998f309..82cf4855f 100755
--- a/examples/bitnet-1.58b/nvidia_measure_memory.sh
+++ b/examples/bitnet-1.58b/nvidia_measure_memory.sh
@@ -1 +1,3 @@
+#!/usr/bin/env bash
+
nvidia-smi --query-gpu=memory.used --format=csv -lms 500
diff --git a/examples/deepseek_mla/README.md b/examples/deepseek_mla/README.md
index de4addcc5..bd3539d26 100644
--- a/examples/deepseek_mla/README.md
+++ b/examples/deepseek_mla/README.md
@@ -31,7 +31,7 @@ Notably, **TileLang accomplishes this with just around 80 lines of Python code**
First, let's review the core computation logic of traditional FlashAttention:
-```python
+```python
# acc_s: [block_M, block_N]
# scores_max: [block_M]
# scores_scale: [block_M]
diff --git a/examples/deepseek_nsa/requirements.txt b/examples/deepseek_nsa/requirements.txt
index 777c2ad4c..e096dfd7d 100644
--- a/examples/deepseek_nsa/requirements.txt
+++ b/examples/deepseek_nsa/requirements.txt
@@ -1 +1 @@
-git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e
\ No newline at end of file
+git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e
diff --git a/examples/deepseek_v32/README.md b/examples/deepseek_v32/README.md
index 66596a619..01a14b6b2 100644
--- a/examples/deepseek_v32/README.md
+++ b/examples/deepseek_v32/README.md
@@ -193,10 +193,10 @@ for i_i in T.Pipelined(NI, num_stages=num_stages):
# Load KV data for selected indices
for bi_i, d_i in T.Parallel(BI, D):
KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BI + bi_i], bz, d_i]
-
+
# Recompute attention scores for backward
T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)
-
+
# Apply softmax gradient: dP = P * (dP_raw - Delta)
for h_i, bi_i in T.Parallel(padded_H, BI):
acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale
@@ -212,7 +212,7 @@ The key gradient computations are:
```python
# Atomically update dKV at selected indices
for bi_i, d_i in T.Parallel(BI // split_store, D // 4):
- T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4],
+ T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4],
acc_dkv_shared[bi_i, d_i * 4])
```
diff --git a/examples/deepseek_v32/inference/config_671B_v3.2.json b/examples/deepseek_v32/inference/config_671B_v3.2.json
index be88f1cca..375aa9aa2 100644
--- a/examples/deepseek_v32/inference/config_671B_v3.2.json
+++ b/examples/deepseek_v32/inference/config_671B_v3.2.json
@@ -23,4 +23,4 @@
"index_n_heads": 64,
"index_head_dim": 128,
"index_topk": 2048
-}
\ No newline at end of file
+}
diff --git a/examples/deepseek_v32/inference/convert.py b/examples/deepseek_v32/inference/convert.py
index df7943918..090be7145 100644
--- a/examples/deepseek_v32/inference/convert.py
+++ b/examples/deepseek_v32/inference/convert.py
@@ -42,7 +42,7 @@ def main(hf_ckpt_path, save_path, n_experts, mp):
save_path (str): Path to the directory where the converted checkpoint files will be saved.
n_experts (int): Total number of experts in the model.
mp (int): Model parallelism factor.
-
+
Returns:
None
"""
diff --git a/examples/deepseek_v32/inference/requirements.txt b/examples/deepseek_v32/inference/requirements.txt
index 604fed552..8c208a8b1 100644
--- a/examples/deepseek_v32/inference/requirements.txt
+++ b/examples/deepseek_v32/inference/requirements.txt
@@ -2,4 +2,4 @@ torch
transformers
safetensors
fast_hadamard_transform
-tilelang==0.1.6
\ No newline at end of file
+tilelang==0.1.6
diff --git a/examples/dequantize_gemm/README.md b/examples/dequantize_gemm/README.md
index 0c6116775..25ef617a2 100644
--- a/examples/dequantize_gemm/README.md
+++ b/examples/dequantize_gemm/README.md
@@ -19,7 +19,7 @@ def dequant_matmul(
T.clear(Ct_local)
for k in T.Pipelined(
- T.ceildiv(K, block_K),
+ T.ceildiv(K, block_K),
num_stages=num_stages
):
T.copy(A[by * block_M, k * block_K], A_shared)
diff --git a/examples/lazy_jit/lazyjit.en.ipynb b/examples/lazy_jit/lazyjit.en.ipynb
index 99cb977f0..197f31ce4 100644
--- a/examples/lazy_jit/lazyjit.en.ipynb
+++ b/examples/lazy_jit/lazyjit.en.ipynb
@@ -786,4 +786,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/examples/lazy_jit/lazyjit.zh.ipynb b/examples/lazy_jit/lazyjit.zh.ipynb
index 601c5c5d2..132790bb3 100644
--- a/examples/lazy_jit/lazyjit.zh.ipynb
+++ b/examples/lazy_jit/lazyjit.zh.ipynb
@@ -786,4 +786,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/images/MatmulExample.svg b/images/MatmulExample.svg
index 6e20daf55..294e8f631 100644
--- a/images/MatmulExample.svg
+++ b/images/MatmulExample.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
diff --git a/images/logo-row.svg b/images/logo-row.svg
index 633243f3a..e73244b74 100644
--- a/images/logo-row.svg
+++ b/images/logo-row.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
diff --git a/maint/precision/compare_ops.py b/maint/precision/compare_ops.py
old mode 100644
new mode 100755
diff --git a/maint/precision/cuda_ops.cu b/maint/precision/cuda_ops.cu
index 519335751..1f37d53de 100644
--- a/maint/precision/cuda_ops.cu
+++ b/maint/precision/cuda_ops.cu
@@ -239,4 +239,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("x"), py::arg("y") = c10::nullopt, py::arg("result"), py::arg("op_type"));
m.def("launch_fast_operator", &launch_fast_operator, "CUDA Fast Operator",
py::arg("x"), py::arg("y") = c10::nullopt, py::arg("result"), py::arg("op_type"));
-}
\ No newline at end of file
+}
diff --git a/maint/scripts/apply_mit_license.sh b/maint/scripts/apply_mit_license.sh
index cc425b964..2bb7cc946 100755
--- a/maint/scripts/apply_mit_license.sh
+++ b/maint/scripts/apply_mit_license.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
echo "Add MIT license boilerplate..."
PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# TO source code root
@@ -17,8 +19,8 @@ done
for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name \
'*apply_mit_liscense.sh' -not -name '*check_mit_liscense.sh' -and \( -name 'CMakeLists.txt' -or -name '*.cmake' \
-or -name '*.py' -or -name '*.dockerfile' -or -name '*.yaml' \) ); do
- sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
- if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then
+ sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
+ if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then
cat maint/scripts/mit_liscense2.txt ${SRC_FILE} > ${SRC_FILE}.new
mv ${SRC_FILE}.new ${SRC_FILE}
fi
@@ -26,7 +28,7 @@ done
for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name \
'*apply_mit_liscense.sh' -not -name '*check_mit_liscense.sh' -name '*.sh' ); do
- sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
+ sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then
line=$(head -n 1 ${SRC_FILE})
if [[ $line == "#!/bin/bash"* ]]; then
diff --git a/maint/scripts/build_docs.sh b/maint/scripts/build_docs.sh
index f367dcc70..3119eb8c7 100755
--- a/maint/scripts/build_docs.sh
+++ b/maint/scripts/build_docs.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
python -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip --no-user
diff --git a/maint/scripts/check_mit_license.sh b/maint/scripts/check_mit_license.sh
index 855c48f4c..3802b1efa 100755
--- a/maint/scripts/check_mit_license.sh
+++ b/maint/scripts/check_mit_license.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
echo "Check MIT License boilerplate..."
PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# To source code root
@@ -8,7 +10,7 @@ EXITCODE=0
for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name '*apply_mit_license.sh' \
-not -name '*check_mit_license.sh' -and \( -name 'CMakeLists.txt' -or -name '*.cpp' -or -name '*.cu' -or -name '*.h' -or -name '*.hpp' \
-or -name '*.py' -or -name '*.sh' -or -name '*.dockerfile' -or -name '*.yaml' \) ); do
-
+
# Skip files that already contain the Apache License
if grep -q "Apache License" "${SRC_FILE}"; then
continue
diff --git a/maint/scripts/local_distribution.sh b/maint/scripts/local_distribution.sh
index ff8239dff..d3b137fb4 100755
--- a/maint/scripts/local_distribution.sh
+++ b/maint/scripts/local_distribution.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
set -eux
rm -rf dist
diff --git a/maint/scripts/pypi_distribution.sh b/maint/scripts/pypi_distribution.sh
index 5a0865141..9a8c6e62c 100755
--- a/maint/scripts/pypi_distribution.sh
+++ b/maint/scripts/pypi_distribution.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
set -eux
rm -rf dist raw_dist
diff --git a/pyproject.toml b/pyproject.toml
index 32c0c6db1..7c9849e3c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -127,7 +127,7 @@ tilelang = "tilelang"
"tilelang/3rdparty/composable_kernel/include" = "3rdparty/composable_kernel/include"
"tilelang/3rdparty/composable_kernel/library" = "3rdparty/composable_kernel/library"
-
+
[tool.codespell]
ignore-words = "docs/spelling_wordlist.txt"
diff --git a/requirements-test.txt b/requirements-test.txt
index a46e87a1c..533cab567 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -19,7 +19,9 @@ docutils
dtlib
einops
flash-linear-attention==0.3.2
+matplotlib
packaging>=21.0
+pandas
pytest-durations
pytest-timeout
pytest-xdist>=2.2.1
@@ -27,10 +29,8 @@ pytest>=6.2.4
pyyaml
requests
scipy
+seaborn
tabulate
tornado
wheel
z3-solver>=4.13.0
-matplotlib
-seaborn
-pandas
diff --git a/requirements.txt b/requirements.txt
index 5efd7c7f9..6735d178b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ torch
torch>=2.7; platform_system == 'Darwin'
tqdm>=4.62.3
typing-extensions>=4.10.0
-z3-solver>=4.13.0
\ No newline at end of file
+z3-solver>=4.13.0
diff --git a/src/op/gemm_sp_py.h b/src/op/gemm_sp_py.h
index 2f79c5e15..b23b9fc5c 100644
--- a/src/op/gemm_sp_py.h
+++ b/src/op/gemm_sp_py.h
@@ -91,4 +91,4 @@ class GemmSPPy : public TileOperator {
} // namespace tl
} // namespace tvm
-#endif // TVM_TL_OP_GEMM_SP_PY_H_
\ No newline at end of file
+#endif // TVM_TL_OP_GEMM_SP_PY_H_
diff --git a/src/tl_templates/cpp/common.h b/src/tl_templates/cpp/common.h
index 0ce6580d3..f1fe801e6 100644
--- a/src/tl_templates/cpp/common.h
+++ b/src/tl_templates/cpp/common.h
@@ -5,4 +5,4 @@
#include
using half_float::half;
-// Not Implemented
\ No newline at end of file
+// Not Implemented
diff --git a/src/tl_templates/cpu/common.h b/src/tl_templates/cpu/common.h
index b288cd114..b69b23186 100644
--- a/src/tl_templates/cpu/common.h
+++ b/src/tl_templates/cpu/common.h
@@ -4,4 +4,4 @@
#include
// Not Implemented
-F
\ No newline at end of file
+F
diff --git a/src/tl_templates/cuda/debug.h b/src/tl_templates/cuda/debug.h
index 3f8ce5e6b..40d364bc9 100644
--- a/src/tl_templates/cuda/debug.h
+++ b/src/tl_templates/cuda/debug.h
@@ -125,4 +125,4 @@ TL_DEVICE void device_assert_with_msg(bool cond, const char *msg) {
printf("Device assert failed: %s\n", msg);
assert(0);
}
-}
\ No newline at end of file
+}
diff --git a/src/tl_templates/cuda/gemm_sp_sm90.h b/src/tl_templates/cuda/gemm_sp_sm90.h
index 6184f9be7..522fc11ee 100644
--- a/src/tl_templates/cuda/gemm_sp_sm90.h
+++ b/src/tl_templates/cuda/gemm_sp_sm90.h
@@ -231,4 +231,4 @@ TL_DEVICE void gemm_sp_ss(A_type *pA, B_type *pB, C_type *accum, E_type *pE) {
CUTE_GCC_UNREACHABLE;
}
}
-} // namespace tl
\ No newline at end of file
+} // namespace tl
diff --git a/src/tl_templates/cuda/ldsm.h b/src/tl_templates/cuda/ldsm.h
index 4d6af8a09..a20746dff 100644
--- a/src/tl_templates/cuda/ldsm.h
+++ b/src/tl_templates/cuda/ldsm.h
@@ -118,4 +118,4 @@ TL_DEVICE void ptx_stmatrix_x4_trans(void const *const smem_ptr,
"r"(value0), "r"(value1), "r"(value2), "r"(value3));
}
-} // namespace tl
\ No newline at end of file
+} // namespace tl
diff --git a/src/tl_templates/hip/hip_fp8.h b/src/tl_templates/hip/hip_fp8.h
index b32f84dca..82fb53031 100644
--- a/src/tl_templates/hip/hip_fp8.h
+++ b/src/tl_templates/hip/hip_fp8.h
@@ -164,4 +164,4 @@ __device__ fp8_e4_16_t make_fp8_e4_16_t(fp8_e4_t x0, fp8_e4_t x1, fp8_e4_t x2,
res.x = res_x;
res.y = res_y;
return res;
-}
\ No newline at end of file
+}
diff --git a/src/tl_templates/hip/ldsm.h b/src/tl_templates/hip/ldsm.h
index 68c1455f7..286b77324 100644
--- a/src/tl_templates/hip/ldsm.h
+++ b/src/tl_templates/hip/ldsm.h
@@ -1,3 +1,3 @@
#pragma once
-#include "common.h"
\ No newline at end of file
+#include "common.h"
diff --git a/src/transform/arg_binder.h b/src/transform/arg_binder.h
index bb7a0f46f..0d7c3ee62 100644
--- a/src/transform/arg_binder.h
+++ b/src/transform/arg_binder.h
@@ -182,4 +182,4 @@ class ArgBinder {
};
} // namespace tl
} // namespace tvm
-#endif // TVM_TL_TRANSFORM_ARG_BINDER_H_
\ No newline at end of file
+#endif // TVM_TL_TRANSFORM_ARG_BINDER_H_
diff --git a/src/transform/atomicadd_vectorize.h b/src/transform/atomicadd_vectorize.h
index 627dc895f..6bd3309ae 100644
--- a/src/transform/atomicadd_vectorize.h
+++ b/src/transform/atomicadd_vectorize.h
@@ -57,4 +57,4 @@ class AtomicAddVectorizePlanner : public arith::IRVisitorWithAnalyzer {
} // namespace tl
} // namespace tvm
-#endif // TVM_TL_ATOMICADD_VECTORIZE_H_
\ No newline at end of file
+#endif // TVM_TL_ATOMICADD_VECTORIZE_H_
diff --git a/src/transform/common/assume.h b/src/transform/common/assume.h
index c6eadc6b3..db830818e 100644
--- a/src/transform/common/assume.h
+++ b/src/transform/common/assume.h
@@ -25,4 +25,4 @@ bool IsAssumeInEvaluateForm(const Stmt &stmt);
} // namespace tl
} // namespace tvm
-#endif // TVM_TL_TRANSFORM_COMMON_ASSUME_H_
\ No newline at end of file
+#endif // TVM_TL_TRANSFORM_COMMON_ASSUME_H_
diff --git a/src/transform/common/loop_vectorization_utils.h b/src/transform/common/loop_vectorization_utils.h
index b9b7715d0..c23252f41 100644
--- a/src/transform/common/loop_vectorization_utils.h
+++ b/src/transform/common/loop_vectorization_utils.h
@@ -781,4 +781,4 @@ class Vectorizer : public StmtMutator,
};
} // namespace tl
-} // namespace tvm
\ No newline at end of file
+} // namespace tvm
diff --git a/src/transform/make_packed_api.cc b/src/transform/make_packed_api.cc
index e9e8f76e6..0fd3a2c16 100644
--- a/src/transform/make_packed_api.cc
+++ b/src/transform/make_packed_api.cc
@@ -619,4 +619,4 @@ TVM_FFI_STATIC_INIT_BLOCK() {
}
} // namespace tl
-} // namespace tvm
\ No newline at end of file
+} // namespace tvm
diff --git a/src/transform/split_host_device.cc b/src/transform/split_host_device.cc
index bfdcb5cd5..57d4b8127 100644
--- a/src/transform/split_host_device.cc
+++ b/src/transform/split_host_device.cc
@@ -268,4 +268,4 @@ TVM_FFI_STATIC_INIT_BLOCK() {
} // namespace transform
} // namespace tl
-} // namespace tvm
\ No newline at end of file
+} // namespace tvm
diff --git a/tilelang/carver/README.md b/tilelang/carver/README.md
index 164006134..f484f47d7 100644
--- a/tilelang/carver/README.md
+++ b/tilelang/carver/README.md
@@ -33,8 +33,8 @@ arch = CUDA("nvidia/geforce-rtx-4090")
# for k in Reduce(1024):
# ...
carve_template = carver.GeneralReductionTemplate(
- structure="SSR",
- shape=[1024, 1024, 1024],
+ structure="SSR",
+ shape=[1024, 1024, 1024],
dtype="float16",
).with_arch(arch)
diff --git a/tilelang/jit/adapter/cython/cython_wrapper.pyx b/tilelang/jit/adapter/cython/cython_wrapper.pyx
index dc462c627..38c1738f7 100644
--- a/tilelang/jit/adapter/cython/cython_wrapper.pyx
+++ b/tilelang/jit/adapter/cython/cython_wrapper.pyx
@@ -83,8 +83,8 @@ cdef class CythonKernelWrapper:
tensor_device = tensor.device
device_type_match = device.type == tensor_device.type
device_index_match = (
- tensor_device.index is None or
- device.index is None or
+ tensor_device.index is None or
+ device.index is None or
tensor_device.index == device.index
)
if not (device_type_match and device_index_match):
@@ -116,7 +116,7 @@ cdef class CythonKernelWrapper:
f"expected {len(shape_list)} dimensions, "
f"got {tensor.dim()}"
)
-
+
# Check each dimension
for shape_idx, expected_shape in shape_list:
actual_shape = tensor.shape[shape_idx]
@@ -176,7 +176,7 @@ cdef class CythonKernelWrapper:
)
# Use current CUDA stream if none specified
- if stream == -1:
+ if stream == -1:
if torch.cuda.is_available():
try:
stream = torch._C._cuda_getCurrentRawStream(torch.cuda.current_device())
@@ -239,7 +239,7 @@ cdef class CythonKernelWrapper:
torch.int64: ctypes.c_int64,
torch.bool: ctypes.c_bool,
}
-
+
call_args = []
for i, tensor in enumerate(tensor_list):
if isinstance(tensor, torch.Tensor):
@@ -286,4 +286,3 @@ cdef class CythonKernelWrapper:
return tensor_list[self.result_idx[0]]
else:
return [tensor_list[i] for i in self.result_idx]
-
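For readers tracing the `stream == -1` hunk above: the sentinel means "use PyTorch's current CUDA stream", resolved via a private accessor. A small pure-Python sketch of that fallback logic, with the defensive guard made explicit (the `AttributeError` branch is an assumption about how a missing private API would surface):

```python
# Hedged sketch of the wrapper's stream resolution.
# torch._C._cuda_getCurrentRawStream is a private API, hence the guard.
import torch


def resolve_stream(stream: int = -1) -> int:
    if stream == -1 and torch.cuda.is_available():
        try:
            # Raw cudaStream_t handle of the current stream, as an int.
            stream = torch._C._cuda_getCurrentRawStream(
                torch.cuda.current_device())
        except AttributeError:
            # Fall back to the public accessor.
            stream = torch.cuda.current_stream().cuda_stream
    return stream
```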