From 6b1faf71faf18c564f5f77e0f5c1671cd91dfbc3 Mon Sep 17 00:00:00 2001 From: LeiWang1999 Date: Sun, 19 Oct 2025 00:33:21 +0800 Subject: [PATCH 1/5] Add document PYTHONPATH build path --- docs/get_started/Installation.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md index f183c99b1..3d5c6db9d 100644 --- a/docs/get_started/Installation.md +++ b/docs/get_started/Installation.md @@ -65,6 +65,26 @@ If you want to install **tile-lang** in development mode, you can run the follow pip install -e . -v ``` +If you prefer to work directly from the source tree via `PYTHONPATH`, make sure the native extension is built first: + +```bash +mkdir -p build +cd build +cmake .. -DUSE_CUDA=ON +make -j +``` +Then add the repository root to `PYTHONPATH` before importing `tilelang`, for example: + +```bash +export PYTHONPATH=/path/to/tilelang:$PYTHONPATH +python -c "import tilelang; print(tilelang.__version__)" +``` + +Some useful CMake options you can toggle while configuring: +- `-DUSE_CUDA=ON|OFF` builds against NVIDIA CUDA (default ON when CUDA headers are found). +- `-DUSE_ROCM=ON` selects ROCm support when building on AMD GPUs. +- `-DNO_VERSION_LABEL=ON` disables the backend/git suffix in `tilelang.__version__`. + We currently provide four methods to install **tile-lang**: 1. [Install Using Docker](#install-method-1) (Recommended) From 128a7967032a49190f8843934b5d8e3771ff9d42 Mon Sep 17 00:00:00 2001 From: LeiWang1999 Date: Sun, 19 Oct 2025 02:21:25 +0800 Subject: [PATCH 2/5] update fp8 benchmark result --- benchmark/matmul_fp8/README.md | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 benchmark/matmul_fp8/README.md diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md new file mode 100644 index 000000000..2a0a027dc --- /dev/null +++ b/benchmark/matmul_fp8/README.md @@ -0,0 +1,36 @@ +# FP8 Matmul Benchmark (8192×8192) + +This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark. The file lives in `/weka-hg/prod/deepseek/permanent/wanglei/tilelang/benchmark/matmul_fp8`. + +## Environment + +- Repository commit: `6b1faf71faf18c564f5f77e0f5c1671cd91dfbc3` +- GPUs: `NVIDIA H800 SXM` on driver `560.35.05` + +## How to Reproduce + +```bash +cd benchmark/matmul_fp8 +python - <<'PY' +from benchmark_matmul import matmul + +M = 8192 +N = 8192 +for K in [256, 512, 1024, 2048, 4096, 8192, 16384]: + res = matmul(M, N, K, False) + tflops = 2 * M * N * K / res.latency * 1e-12 + print(f"K={K:5d} latency={res.latency:.6f}s TFlops={tflops:.3f}") +PY +``` + +## Results + +| K | Latency (s) | Throughput (TFLOPs) | +|-------|-------------|---------------------| +| 256 | 0.091488 | 0.376 | +| 512 | 0.110496 | 0.622 | +| 1024 | 0.148256 | 0.927 | +| 2048 | 0.234080 | 1.174 | +| 4096 | 0.398944 | 1.378 | +| 8192 | 0.752416 | 1.461 | +| 16384 | 1.443808 | 1.523 | From dff30dcb2b155f1145caf6cf7045b29c80cf6482 Mon Sep 17 00:00:00 2001 From: LeiWang1999 Date: Sun, 19 Oct 2025 02:23:24 +0800 Subject: [PATCH 3/5] remove redpath --- benchmark/matmul_fp8/benchmark_matmul.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 4606f80b2..9477c7ea0 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -219,9 +219,9 @@ def main( if __name__ == "__main__": # Parse command-line arguments for matrix dimensions parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") - parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") - parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") + parser.add_argument("--m", type=int, default=8192, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=8192, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=8192, help="Matrix dimension K") parser.add_argument( "--with_roller", action="store_true", @@ -237,13 +237,11 @@ def main( # matmul(...) returns (best_latency, best_config, ref_latency) best_result = matmul(M, N, K, with_roller) + print(best_result.get_kernel_source()) best_latency = best_result.latency best_config = best_result.config - ref_latency = best_result.ref_latency # Print out the benchmark results print(f"Best latency (s): {best_latency}") print(f"Best TFlops: {total_flops / best_latency * 1e-9:.3f}") print(f"Best config: {best_config}") - - print(f"Reference TFlops: {total_flops / ref_latency * 1e-9:.3f}") From 9fcebdcf65b8787cca8df351e8fc6bba5ce59d64 Mon Sep 17 00:00:00 2001 From: LeiWang1999 Date: Sun, 19 Oct 2025 02:24:08 +0800 Subject: [PATCH 4/5] remove path --- benchmark/matmul_fp8/README.md | 2 +- benchmark/matmul_fp8/benchmark_matmul.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md index 2a0a027dc..d50f56e10 100644 --- a/benchmark/matmul_fp8/README.md +++ b/benchmark/matmul_fp8/README.md @@ -1,6 +1,6 @@ # FP8 Matmul Benchmark (8192×8192) -This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark. The file lives in `/weka-hg/prod/deepseek/permanent/wanglei/tilelang/benchmark/matmul_fp8`. +This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP8 matrices sized `M = N = 8192` across different `K` dimensions. Each measurement relies on the default autotuning search space bundled with the benchmark. ## Environment diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 9477c7ea0..472a60061 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -219,9 +219,9 @@ def main( if __name__ == "__main__": # Parse command-line arguments for matrix dimensions parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument("--m", type=int, default=8192, help="Matrix dimension M") - parser.add_argument("--n", type=int, default=8192, help="Matrix dimension N") - parser.add_argument("--k", type=int, default=8192, help="Matrix dimension K") + parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") parser.add_argument( "--with_roller", action="store_true", @@ -237,7 +237,6 @@ def main( # matmul(...) returns (best_latency, best_config, ref_latency) best_result = matmul(M, N, K, with_roller) - print(best_result.get_kernel_source()) best_latency = best_result.latency best_config = best_result.config From f199105359bcaba593aaaed3f84ad9fe83dbdb24 Mon Sep 17 00:00:00 2001 From: LeiWang1999 Date: Sun, 19 Oct 2025 12:14:39 +0800 Subject: [PATCH 5/5] tflops fix --- benchmark/matmul_fp8/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md index d50f56e10..fa33d19cd 100644 --- a/benchmark/matmul_fp8/README.md +++ b/benchmark/matmul_fp8/README.md @@ -27,10 +27,10 @@ PY | K | Latency (s) | Throughput (TFLOPs) | |-------|-------------|---------------------| -| 256 | 0.091488 | 0.376 | -| 512 | 0.110496 | 0.622 | -| 1024 | 0.148256 | 0.927 | -| 2048 | 0.234080 | 1.174 | -| 4096 | 0.398944 | 1.378 | -| 8192 | 0.752416 | 1.461 | -| 16384 | 1.443808 | 1.523 | +| 256 | 0.091488 | 376 | +| 512 | 0.110496 | 622 | +| 1024 | 0.148256 | 927 | +| 2048 | 0.234080 | 1174 | +| 4096 | 0.398944 | 1378 | +| 8192 | 0.752416 | 1461 | +| 16384 | 1.443808 | 1523 |