From 6b1faf71faf18c564f5f77e0f5c1671cd91dfbc3 Mon Sep 17 00:00:00 2001
From: LeiWang1999 <leiwang1999@outlook.com>
Date: Sun, 19 Oct 2025 00:33:21 +0800
Subject: [PATCH 1/5] Document the PYTHONPATH build path

---
 docs/get_started/Installation.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md
index f183c99b1..3d5c6db9d 100644
--- a/docs/get_started/Installation.md
+++ b/docs/get_started/Installation.md
@@ -65,6 +65,26 @@ If you want to install **tile-lang** in development mode, you can run the follow
 pip install -e . -v
 ```
 
+If you prefer to work directly from the source tree via `PYTHONPATH`, make sure the native extension is built first:
+
+```bash
+mkdir -p build
+cd build
+cmake .. -DUSE_CUDA=ON
+make -j"$(nproc)"
+```
+Then add the repository root to `PYTHONPATH` before importing `tilelang`, for example:
+
+```bash
+export PYTHONPATH=/path/to/tilelang:$PYTHONPATH
+python -c "import tilelang; print(tilelang.__version__)"
+```
+
+Some useful CMake options you can toggle while configuring:
+- `-DUSE_CUDA=ON|OFF` builds against NVIDIA CUDA (default ON when CUDA headers are found).
+- `-DUSE_ROCM=ON` selects ROCm support when building on AMD GPUs.
+- `-DNO_VERSION_LABEL=ON` disables the backend/git suffix in `tilelang.__version__`.
+
 We currently provide four methods to install **tile-lang**:
 
 1. [Install Using Docker](#install-method-1) (Recommended)

From 128a7967032a49190f8843934b5d8e3771ff9d42 Mon Sep 17 00:00:00 2001
From: LeiWang1999 <leiwang1999@outlook.com>
Date: Sun, 19 Oct 2025 02:21:25 +0800
Subject: [PATCH 2/5] update fp8 benchmark result

---
 benchmark/matmul_fp8/README.md | 36 ++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 benchmark/matmul_fp8/README.md

diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md
new file mode 100644
index 000000000..2a0a027dc
--- /dev/null
+++ b/benchmark/matmul_fp8/README.md
@@ -0,0 +1,36 @@
+# FP8 Matmul Benchmark (8192×8192)
+
+This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark. The file lives in `/weka-hg/prod/deepseek/permanent/wanglei/tilelang/benchmark/matmul_fp8`.
+
+## Environment
+
+- Repository commit: `6b1faf71faf18c564f5f77e0f5c1671cd91dfbc3`
+- GPUs: `NVIDIA H800 SXM` on driver `560.35.05`
+
+## How to Reproduce
+
+```bash
+cd benchmark/matmul_fp8
+python - <<'PY'
+from benchmark_matmul import matmul
+
+M = 8192
+N = 8192
+for K in [256, 512, 1024, 2048, 4096, 8192, 16384]:
+    res = matmul(M, N, K, False)
+    tflops = 2 * M * N * K / res.latency * 1e-9  # latency is reported in milliseconds
+    print(f"K={K:5d}  latency={res.latency:.6f}ms  TFlops={tflops:.3f}")
+PY
+```
+
+## Results
+
+| K     | Latency (s) | Throughput (TFLOPs) |
+|-------|-------------|---------------------|
+|   256 | 0.091488    | 0.376               |
+|   512 | 0.110496    | 0.622               |
+|  1024 | 0.148256    | 0.927               |
+|  2048 | 0.234080    | 1.174               |
+|  4096 | 0.398944    | 1.378               |
+|  8192 | 0.752416    | 1.461               |
+| 16384 | 1.443808    | 1.523               |

From dff30dcb2b155f1145caf6cf7045b29c80cf6482 Mon Sep 17 00:00:00 2001
From: LeiWang1999 <leiwang1999@outlook.com>
Date: Sun, 19 Oct 2025 02:23:24 +0800
Subject: [PATCH 3/5] remove ref path

---
 benchmark/matmul_fp8/benchmark_matmul.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py
index 4606f80b2..9477c7ea0 100644
--- a/benchmark/matmul_fp8/benchmark_matmul.py
+++ b/benchmark/matmul_fp8/benchmark_matmul.py
@@ -219,9 +219,9 @@ def main(
 if __name__ == "__main__":
     # Parse command-line arguments for matrix dimensions
     parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
-    parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
-    parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
-    parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
+    parser.add_argument("--m", type=int, default=8192, help="Matrix dimension M")
+    parser.add_argument("--n", type=int, default=8192, help="Matrix dimension N")
+    parser.add_argument("--k", type=int, default=8192, help="Matrix dimension K")
     parser.add_argument(
         "--with_roller",
         action="store_true",
@@ -237,13 +237,11 @@ def main(
 
     # matmul(...) returns (best_latency, best_config, ref_latency)
     best_result = matmul(M, N, K, with_roller)
+    print(best_result.get_kernel_source())
     best_latency = best_result.latency
     best_config = best_result.config
-    ref_latency = best_result.ref_latency
 
     # Print out the benchmark results
     print(f"Best latency (s): {best_latency}")
     print(f"Best TFlops: {total_flops / best_latency * 1e-9:.3f}")
     print(f"Best config: {best_config}")
-
-    print(f"Reference TFlops: {total_flops / ref_latency * 1e-9:.3f}")

From 9fcebdcf65b8787cca8df351e8fc6bba5ce59d64 Mon Sep 17 00:00:00 2001
From: LeiWang1999 <leiwang1999@outlook.com>
Date: Sun, 19 Oct 2025 02:24:08 +0800
Subject: [PATCH 4/5] remove path

---
 benchmark/matmul_fp8/README.md           | 2 +-
 benchmark/matmul_fp8/benchmark_matmul.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md
index 2a0a027dc..d50f56e10 100644
--- a/benchmark/matmul_fp8/README.md
+++ b/benchmark/matmul_fp8/README.md
@@ -1,6 +1,6 @@
 # FP8 Matmul Benchmark (8192×8192)
 
-This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark. The file lives in `/weka-hg/prod/deepseek/permanent/wanglei/tilelang/benchmark/matmul_fp8`.
+This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark.
 
 ## Environment
 
diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py
index 9477c7ea0..472a60061 100644
--- a/benchmark/matmul_fp8/benchmark_matmul.py
+++ b/benchmark/matmul_fp8/benchmark_matmul.py
@@ -219,9 +219,9 @@ def main(
 if __name__ == "__main__":
     # Parse command-line arguments for matrix dimensions
     parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark")
-    parser.add_argument("--m", type=int, default=8192, help="Matrix dimension M")
-    parser.add_argument("--n", type=int, default=8192, help="Matrix dimension N")
-    parser.add_argument("--k", type=int, default=8192, help="Matrix dimension K")
+    parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
+    parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
+    parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
     parser.add_argument(
         "--with_roller",
         action="store_true",
@@ -237,7 +237,6 @@ def main(
 
     # matmul(...) returns (best_latency, best_config, ref_latency)
     best_result = matmul(M, N, K, with_roller)
-    print(best_result.get_kernel_source())
     best_latency = best_result.latency
     best_config = best_result.config
 

From f199105359bcaba593aaaed3f84ad9fe83dbdb24 Mon Sep 17 00:00:00 2001
From: LeiWang1999 <leiwang1999@outlook.com>
Date: Sun, 19 Oct 2025 12:14:39 +0800
Subject: [PATCH 5/5] tflops fix

---
 benchmark/matmul_fp8/README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md
index d50f56e10..fa33d19cd 100644
--- a/benchmark/matmul_fp8/README.md
+++ b/benchmark/matmul_fp8/README.md
@@ -27,10 +27,10 @@ PY
 
 | K     | Latency (s) | Throughput (TFLOPs) |
 |-------|-------------|---------------------|
-|   256 | 0.091488    | 0.376               |
-|   512 | 0.110496    | 0.622               |
-|  1024 | 0.148256    | 0.927               |
-|  2048 | 0.234080    | 1.174               |
-|  4096 | 0.398944    | 1.378               |
-|  8192 | 0.752416    | 1.461               |
-| 16384 | 1.443808    | 1.523               |
+|   256 | 0.091488    | 376                 |
+|   512 | 0.110496    | 622                 |
+|  1024 | 0.148256    | 927                 |
+|  2048 | 0.234080    | 1174                |
+|  4096 | 0.398944    | 1378                |
+|  8192 | 0.752416    | 1461                |
+| 16384 | 1.443808    | 1523                |