9 changes: 4 additions & 5 deletions .github/workflows/pr-regression-test-bot.yml
@@ -59,7 +59,7 @@ jobs:
const { owner, repo } = context.repo
const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username })
core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none

- name: Reject if not allowed
if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }}
run: |
@@ -241,10 +241,10 @@ jobs:
const fs = require('fs');
// Read the file directly instead of passing via env/outputs to avoid escaping issues
const md = fs.readFileSync('regression_result.md', 'utf8');

const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`;
const body =

const body =
'Performance Benchmark Report\n' +
'============================\n\n' +
`Triggered by: @${context.payload.comment.user.login}\n` +
@@ -262,4 +262,3 @@ jobs:
issue_number: context.issue.number,
body
});

10 changes: 4 additions & 6 deletions .pre-commit-config.yaml
@@ -13,15 +13,13 @@ repos:
hooks:
- id: check-symlinks
- id: destroyed-symlinks
# FIXME: enable these hooks
# - id: trailing-whitespace
# - id: end-of-file-fixer
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-added-large-files
- id: check-merge-conflict
fail_fast: true
# FIXME: enable these hooks
# - id: check-executables-have-shebangs
# - id: check-shebang-scripts-are-executable
- id: check-executables-have-shebangs
- id: check-shebang-scripts-are-executable
- id: detect-private-key
- id: check-yaml
- id: check-toml
2 changes: 1 addition & 1 deletion .pymarkdown
@@ -4,7 +4,7 @@
"style": "atx"
},
"md004": {
"style": "dash"
"style": "dash"
},
"md013": {
"enabled": false
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,7 +1,7 @@
MIT License

Copyright (c) Tile-AI.
**During the period from December 1, 2024, to Mar 14, 2025, this project is
**During the period from December 1, 2024, to Mar 14, 2025, this project is
subject to additional collaboration terms with Microsoft Corporation.**

Permission is hereby granted, free of charge, to any person obtaining a copy
4 changes: 2 additions & 2 deletions README.md
@@ -63,7 +63,7 @@ TileLang achieves exceptional performance across a variety of computational patt
<img src="./examples/deepseek_mla/figures/bs128_float16.png" alt="mla decode performance bs128 on H100" width="100%" />
</div>
</div>

- Flash Attention Performance on H100

<div align="center"> <img src="./images/mha_performance_h100.png" alt="operator performance on H100" width=80% />
@@ -170,7 +170,7 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.fl
# Perform a tile-level GEMM on the shared buffers
# Currently we dispatch to the cute/hip on Nvidia/AMD GPUs
T.gemm(A_shared, B_shared, C_local)

# relu
for i, j in T.Parallel(block_M, block_N):
C_local[i, j] = T.max(C_local[i, j], 0)
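
For readers skimming the diff: the kernel body shown above ends with a tile-level GEMM followed by an element-wise ReLU. A minimal NumPy sketch of the same semantics (illustrative only, not part of this PR's changes):

```python
import numpy as np

def matmul_relu_reference(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Reference for the fused kernel above: C = max(A @ B, 0)."""
    C = A.astype(np.float32) @ B.astype(np.float32)  # accumulate in float32, mirroring accum_dtype
    return np.maximum(C, 0.0)

# float16 inputs, float32 accumulation, as in the TileLang signature
A = np.random.randn(256, 128).astype(np.float16)
B = np.random.randn(128, 256).astype(np.float16)
C = matmul_relu_reference(A, B)
```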
4 changes: 2 additions & 2 deletions THIRDPARTYNOTICES.txt
@@ -1,5 +1,5 @@
BitBLAS uses third-party material as listed below. The attached notices are
provided for informational purposes only.
BitBLAS uses third-party material as listed below. The attached notices are
provided for informational purposes only.

Notice for apache/tvm
-------------------------------
2 changes: 1 addition & 1 deletion docker/Dockerfile.cu118
@@ -1,4 +1,4 @@
FROM nvcr.io/nvidia/pytorch:22.12-py3
FROM nvcr.io/nvidia/pytorch:22.12-py3

WORKDIR /root

2 changes: 1 addition & 1 deletion docker/Dockerfile.cu120
@@ -1,4 +1,4 @@
FROM nvcr.io/nvidia/pytorch:23.01-py3
FROM nvcr.io/nvidia/pytorch:23.01-py3

WORKDIR /root

2 changes: 1 addition & 1 deletion docs/.gitignore
@@ -1,2 +1,2 @@
_build/
autoapi/
autoapi/
2 changes: 1 addition & 1 deletion docs/CNAME
@@ -1 +1 @@
tilelang.com
tilelang.com
1 change: 0 additions & 1 deletion docs/_static/custom.css
@@ -8,4 +8,3 @@
.sidebar-logo-container {
line-height: 0;
}

2 changes: 1 addition & 1 deletion docs/_static/img/logo-row.svg
(SVG file; diff not rendered)
4 changes: 2 additions & 2 deletions docs/deeplearning_operators/deepseek_mla.md
@@ -38,7 +38,7 @@ Notably, **TileLang accomplishes this with just around 80 lines of Python code**

First, let's review the core computation logic of traditional FlashAttention:

```python
```python
# acc_s: [block_M, block_N]
# scores_max: [block_M]
# scores_scale: [block_M]
@@ -160,7 +160,7 @@ Key implementation differences between Hopper and MI300X architectures include:
# Original shared memory allocation
Q_shared = T.alloc_shared([block_H, dim], dtype)
Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype)

# Optimized register allocation
Q_local = T.alloc_fragment([block_H, dim], dtype)
Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype)
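
The hunks above only show buffer shapes and the shared-memory-to-register change. As a rough NumPy sketch of the online-softmax update that FlashAttention-style kernels perform per K/V block (names and shapes assumed for illustration; this is not the kernel in this PR):

```python
import numpy as np

def flash_attention_block_update(acc_o, logsum, scores_max, acc_s, V_block):
    """One K/V-block step of online softmax, as used in FlashAttention-style kernels.

    acc_s:      [block_M, block_N]  raw scores Q @ K_block^T (already scaled)
    scores_max: [block_M]           running row-wise maximum
    logsum:     [block_M]           running softmax denominator
    acc_o:      [block_M, dim]      running unnormalized output
    """
    new_max = np.maximum(scores_max, acc_s.max(axis=1))
    scores_scale = np.exp(scores_max - new_max)     # rescale factor for previously accumulated state
    p = np.exp(acc_s - new_max[:, None])            # block-local unnormalized probabilities
    acc_o = acc_o * scores_scale[:, None] + p @ V_block
    logsum = logsum * scores_scale + p.sum(axis=1)
    return acc_o, logsum, new_max

# After the final block: O = acc_o / logsum[:, None]
```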
2 changes: 1 addition & 1 deletion docs/deeplearning_operators/elementwise.md
@@ -8,7 +8,7 @@
:class: myclass1 myclass2
:name: a-tip-reference

This document is still **experimental** and may be incomplete.
This document is still **experimental** and may be incomplete.
Suggestions and improvements are highly encouraged—please submit a PR!
:::

2 changes: 1 addition & 1 deletion docs/deeplearning_operators/gemv.md
@@ -6,7 +6,7 @@
</div>

:::{warning}
This document is still **experimental** and may be incomplete.
This document is still **experimental** and may be incomplete.
Suggestions and improvements are highly encouraged—please submit a PR!
:::

4 changes: 2 additions & 2 deletions docs/deeplearning_operators/matmul_sparse.md
@@ -5,7 +5,7 @@
</div>

:::{warning}
This document is still **experimental** and may be incomplete.
This document is still **experimental** and may be incomplete.

This feature is still **experimental** and needs further optimization.

@@ -40,7 +40,7 @@ Both `PyTorch` and `vLLM` use `CUTLASS` as their computation backend (see refere

A set of **CUTLASS-compatible** compressors is provided in `tilelang.utils.sparse`, where a dense tensor—along with other required arguments (e.g., block_K for sm90, transpose options)—can be passed in to perform the compression.

```python
```python
from tilelang.utils.sparse import compress
A_sparse, E = compress(A, transposed=trans_A, block_k=block_K)
```
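
For context on what the compressor produces: structured 2:4 sparsity keeps two of every four values along K plus index metadata. A toy NumPy sketch of that idea (purely illustrative; the actual `tilelang.utils.sparse` / CUTLASS layout differs):

```python
import numpy as np

def compress_2to4(dense: np.ndarray):
    """Toy illustration of 2:4 structured-sparsity compression.

    For every 4 consecutive values along the last axis, keep the 2 with the
    largest magnitude; return packed non-zero values and their in-group indices.
    """
    M, K = dense.shape
    assert K % 4 == 0
    groups = dense.reshape(M, K // 4, 4)
    # indices of the two largest-magnitude entries in each group of four
    idx = np.sort(np.argsort(-np.abs(groups), axis=-1)[..., :2], axis=-1)
    values = np.take_along_axis(groups, idx, axis=-1)      # [M, K//4, 2]
    return values.reshape(M, K // 2), idx.reshape(M, K // 2)

A = np.random.randn(8, 16).astype(np.float32)
A_vals, A_meta = compress_2to4(A)   # roughly the values/metadata split a compressor produces
```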
24 changes: 12 additions & 12 deletions docs/get_started/overview.md
@@ -15,38 +15,38 @@ Figure 1: High-level overview of the TileLang compilation flow.
## Programming Interfaces

1. **Beginner Level (Hardware-Unaware)**
- Intended for users who need to write code that is independent of specific hardware details.
- The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations.
- Intended for users who need to write code that is independent of specific hardware details.
- The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations.
- *Note:* This interface is not yet fully implemented.

2. **Developer Level (Hardware-Aware with Tile Library)**
- Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations.
- Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures.
- Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations.
- Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures.
- Users at this level can leverage these ready-made primitives without diving into low-level threading details.

3. **Expert Level (Hardware-Aware with Thread Primitives)**
- For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing).
- Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels.
- For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing).
- Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels.
- This level grants maximum flexibility for specialized optimizations tailored to specific GPU or multi-core architectures.

## Compilation Flow

1. **Tile Program**
1. **Tile Program**
A high-level specification of the computation. Depending on the user’s expertise, they may write a purely hardware-unaware tile program or incorporate constructs from the Tile Library or thread primitives.

2. **Tile Program with Tile Library**
2. **Tile Program with Tile Library**
When developers choose from the Tile Library, the original Tile Program is expanded with specialized library calls. These calls encapsulate efficient implementation patterns for different operations.

3. **Tile Program with Thread Primitives**
3. **Tile Program with Thread Primitives**
Expert-level developers can explicitly use low-level threading constructs to hand-optimize data layout, synchronization, and memory usage.

4. **IRModule**
4. **IRModule**
After the program is composed with libraries or thread primitives, it is lowered to an intermediate representation (IR) that captures the necessary hardware details.

5. **Source Code Generation (C/CUDA/HIP/LLVM/…)**
5. **Source Code Generation (C/CUDA/HIP/LLVM/…)**
From the IR, the system generates target-specific source code. This source code is tuned for the desired backends or GPU architectures (e.g., NVIDIA, AMD).

6. **Hardware-Specific Executable/Runtime**
6. **Hardware-Specific Executable/Runtime**
Finally, the generated source is compiled into hardware-specific executables, ready to run on the corresponding devices. The pipeline supports multiple GPU backends and can be extended to additional architectures.

## Tile-based Programming Model
2 changes: 1 addition & 1 deletion examples/bitnet-1.58b/.gitignore
@@ -1 +1 @@
models/
models/
2 changes: 2 additions & 0 deletions examples/bitnet-1.58b/benchmark.sh
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

python benchmark_generate.py --bs 16 --in_seq_len 32 --out_seq_len 128 | tee b16_i32_o128.log

python benchmark_generate.py --bs 1 --in_seq_len 512 --out_seq_len 64 | tee b1_i512_o64.log
(additional file; name not shown in this capture)
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

# retrieve the native model input and saved model directory
MODEL_DIR=$1
SAVED_MODEL_DIR=$2
(additional file; name not shown in this capture)
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

# require git lfs
if ! command -v git-lfs &> /dev/null; then
echo "Please install git-lfs first by running 'sudo apt install git-lfs'"
2 changes: 1 addition & 1 deletion examples/bitnet-1.58b/maint/quantize_config.json
@@ -7,4 +7,4 @@
"model_name_or_path": "1bitLLM/bitnet_b1_58-3B",
"quant_method": "bitnet",
"checkpoint_format": "bitnet"
}
}
2 changes: 2 additions & 0 deletions examples/bitnet-1.58b/maint/upload_models.sh
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

MODEL_DIR=$1
REMOTE_DIR=$2

2 changes: 2 additions & 0 deletions examples/bitnet-1.58b/nvidia_measure_memory.sh
@@ -1 +1,3 @@
#!/usr/bin/env bash

nvidia-smi --query-gpu=memory.used --format=csv -lms 500
2 changes: 1 addition & 1 deletion examples/deepseek_mla/README.md
@@ -31,7 +31,7 @@ Notably, **TileLang accomplishes this with just around 80 lines of Python code**

First, let's review the core computation logic of traditional FlashAttention:

```python
```python
# acc_s: [block_M, block_N]
# scores_max: [block_M]
# scores_scale: [block_M]
2 changes: 1 addition & 1 deletion examples/deepseek_nsa/requirements.txt
@@ -1 +1 @@
git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e
git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e
6 changes: 3 additions & 3 deletions examples/deepseek_v32/README.md
@@ -193,10 +193,10 @@ for i_i in T.Pipelined(NI, num_stages=num_stages):
# Load KV data for selected indices
for bi_i, d_i in T.Parallel(BI, D):
KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BI + bi_i], bz, d_i]

# Recompute attention scores for backward
T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol)

# Apply softmax gradient: dP = P * (dP_raw - Delta)
for h_i, bi_i in T.Parallel(padded_H, BI):
acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale
@@ -212,7 +212,7 @@ The key gradient computations are:
```python
# Atomically update dKV at selected indices
for bi_i, d_i in T.Parallel(BI // split_store, D // 4):
T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4],
T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4],
acc_dkv_shared[bi_i, d_i * 4])
```
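
For intuition, `dP = P * (dP_raw - Delta)` above is the standard row-wise softmax backward identity (the kernel additionally folds in `sm_scale`). A small NumPy check of that identity, not taken from this PR:

```python
import numpy as np

# Check the softmax-gradient identity dS = P * (dP - Delta),
# where Delta is the per-row sum of P * dP.
rng = np.random.default_rng(0)
S = rng.standard_normal((4, 8))                 # raw attention scores
dP = rng.standard_normal((4, 8))                # incoming gradient w.r.t. P
P = np.exp(S - S.max(-1, keepdims=True))
P /= P.sum(-1, keepdims=True)                   # row-wise softmax

Delta = (P * dP).sum(-1, keepdims=True)
dS_fast = P * (dP - Delta)                      # fused form used in such kernels

# Reference via the full softmax Jacobian, row by row
dS_ref = np.stack([(np.diag(p) - np.outer(p, p)) @ g for p, g in zip(P, dP)])
assert np.allclose(dS_fast, dS_ref)
```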

2 changes: 1 addition & 1 deletion examples/deepseek_v32/inference/config_671B_v3.2.json
@@ -23,4 +23,4 @@
"index_n_heads": 64,
"index_head_dim": 128,
"index_topk": 2048
}
}
2 changes: 1 addition & 1 deletion examples/deepseek_v32/inference/convert.py
@@ -42,7 +42,7 @@ def main(hf_ckpt_path, save_path, n_experts, mp):
save_path (str): Path to the directory where the converted checkpoint files will be saved.
n_experts (int): Total number of experts in the model.
mp (int): Model parallelism factor.

Returns:
None
"""
2 changes: 1 addition & 1 deletion examples/deepseek_v32/inference/requirements.txt
@@ -2,4 +2,4 @@ torch
transformers
safetensors
fast_hadamard_transform
tilelang==0.1.6
tilelang==0.1.6
2 changes: 1 addition & 1 deletion examples/dequantize_gemm/README.md
@@ -19,7 +19,7 @@ def dequant_matmul(

T.clear(Ct_local)
for k in T.Pipelined(
T.ceildiv(K, block_K),
T.ceildiv(K, block_K),
num_stages=num_stages
):
T.copy(A[by * block_M, k * block_K], A_shared)
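
The hunk shows the K loop being pipelined over `T.ceildiv(K, block_K)` tiles. A NumPy sketch of what a dequantize-then-GEMM kernel computes, with a made-up per-column scale scheme purely for illustration (quantization layouts in the real example differ):

```python
import numpy as np

def dequant_matmul_reference(A, Bq, scale, block_K=64):
    """Toy reference for dequantize-then-GEMM.

    A:     [M, K] activations
    Bq:    [K, N] quantized weights (e.g. int8)
    scale: [N]    per-column dequantization scale (one possible scheme)
    """
    M, K = A.shape
    C = np.zeros((M, Bq.shape[1]), dtype=np.float32)
    # Mirror the pipelined loop: process K in ceildiv(K, block_K) tiles
    for k in range(-(-K // block_K)):
        ks = slice(k * block_K, min((k + 1) * block_K, K))
        B_tile = Bq[ks].astype(np.float32) * scale     # dequantize the current tile
        C += A[:, ks].astype(np.float32) @ B_tile
    return C

A = np.random.randn(64, 256).astype(np.float16)
Bq = np.random.randint(-128, 127, size=(256, 32), dtype=np.int8)
scale = np.random.rand(32).astype(np.float32)
C = dequant_matmul_reference(A, Bq, scale)
```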
2 changes: 1 addition & 1 deletion examples/lazy_jit/lazyjit.en.ipynb
@@ -786,4 +786,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
2 changes: 1 addition & 1 deletion examples/lazy_jit/lazyjit.zh.ipynb
@@ -786,4 +786,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
2 changes: 1 addition & 1 deletion images/MatmulExample.svg
(SVG file; diff not rendered)
2 changes: 1 addition & 1 deletion images/logo-row.svg
(SVG file; diff not rendered)
Empty file modified maint/precision/compare_ops.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion maint/precision/cuda_ops.cu
@@ -239,4 +239,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("x"), py::arg("y") = c10::nullopt, py::arg("result"), py::arg("op_type"));
m.def("launch_fast_operator", &launch_fast_operator, "CUDA Fast Operator",
py::arg("x"), py::arg("y") = c10::nullopt, py::arg("result"), py::arg("op_type"));
}
}
8 changes: 5 additions & 3 deletions maint/scripts/apply_mit_license.sh
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

echo "Add MIT license boilerplate..."
PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# TO source code root
@@ -17,16 +19,16 @@ done
for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name \
'*apply_mit_liscense.sh' -not -name '*check_mit_liscense.sh' -and \( -name 'CMakeLists.txt' -or -name '*.cmake' \
-or -name '*.py' -or -name '*.dockerfile' -or -name '*.yaml' \) ); do
sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then
sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then
cat maint/scripts/mit_liscense2.txt ${SRC_FILE} > ${SRC_FILE}.new
mv ${SRC_FILE}.new ${SRC_FILE}
fi
done

for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name \
'*apply_mit_liscense.sh' -not -name '*check_mit_liscense.sh' -name '*.sh' ); do
sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
sed -i '/\#\s*Microsoft\s*(c)/Id' ${SRC_FILE}
if !(grep -q "Copyright (c) Tile-AI Corporation." "${SRC_FILE}"); then
line=$(head -n 1 ${SRC_FILE})
if [[ $line == "#!/bin/bash"* ]]; then
2 changes: 2 additions & 0 deletions maint/scripts/build_docs.sh
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

python -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip --no-user
4 changes: 3 additions & 1 deletion maint/scripts/check_mit_license.sh
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

echo "Check MIT License boilerplate..."
PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# To source code root
@@ -8,7 +10,7 @@ EXITCODE=0
for SRC_FILE in $(find . -path './3rdparty' -prune -false -o -path './build' -prune -false -o -type f -not -name '*apply_mit_license.sh' \
-not -name '*check_mit_license.sh' -and \( -name 'CMakeLists.txt' -or -name '*.cpp' -or -name '*.cu' -or -name '*.h' -or -name '*.hpp' \
-or -name '*.py' -or -name '*.sh' -or -name '*.dockerfile' -or -name '*.yaml' \) ); do

# Skip files that already contain the Apache License
if grep -q "Apache License" "${SRC_FILE}"; then
continue