diff --git a/aiter/utility/mp_tuner.py b/aiter/utility/mp_tuner.py index b99f9e4ffa..6a4d8652b7 100644 --- a/aiter/utility/mp_tuner.py +++ b/aiter/utility/mp_tuner.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import torch import multiprocessing as mp import time @@ -37,7 +37,7 @@ def worker( us = round(us, 4) except RuntimeError as e: - print(f"run gpu func error: info:{info}\t {e}") + print(f"run gpu func warning: info:{info}\t {e}", flush=True) us = -1 # not support or error max_err_ratio = 1.0 max_retries = 3 @@ -82,24 +82,28 @@ def worker( max_err_ratio = max(max_err_ratio, err_ratio) except RuntimeError as e: if "CUDA" in str(e) or "HIP" in str(e) or "out of memory" in str(e).lower(): - print(f"GPU Runtime Error in process:{pid} info:{info}: {e}") + if printLog: + print(f"GPU Runtime Error in process:{pid} info:{info}: {e}") # Try to recover GPU state try: torch.cuda.empty_cache() torch.cuda.synchronize() except Exception as e: - print(f"Error in process:{pid} info:{info}: {e}") + if printLog: + print(f"Error in process:{pid} info:{info}: {e}") pass else: print(f"Runtime Error in process:{pid} info:{info}: {e}") us = -1 # float("inf") max_err_ratio = 1.0 except TimeoutError as e: - print(f"Timeout in process:{pid} info:{info}: {e}") + if printLog: + print(f"Timeout in process:{pid} info:{info}: {e}") us = float("inf") max_err_ratio = 1.0 except Exception as e: - print(f"Unexpected Error in process:{pid} info:{info}: {e}") + if printLog: + print(f"Unexpected Error in process:{pid} info:{info}: {e}") # import traceback # traceback.print_exc() @@ -109,7 +113,7 @@ def worker( return info, us, round(max_err_ratio, 4) -def work_group(GPUIDMap, fast_mode, err_ratio, in_data, tasks, printLog=False): +def work_group(GPUIDMap, fast_mode, err_ratio, in_data, tasks, verbose=False): """Work group that processes a 
batch of related tasks.""" group_task = [tasks] if not isinstance(tasks, list) else tasks kernels_num, (input_data) = in_data @@ -204,7 +208,7 @@ def work_group(GPUIDMap, fast_mode, err_ratio, in_data, tasks, printLog=False): ) # Run worker with explicit GPU ID - ret = worker(*work_args, tol_err_ratio=err_ratio) + ret = worker(*work_args, printLog=verbose, tol_err_ratio=err_ratio) rets.append(ret) return rets @@ -458,7 +462,8 @@ def add_dummy_result(k, results_list): # pool_restart_needed = True else: error_msg = f"[Failed] Task {k} failed with {error_type}: {e}" - # pool_restart_needed = True + failed_tasks.append((k, "timeout")) + completed_this_round.append((k, async_result)) # Only log error once per error type if error_type not in logged_error_types: @@ -515,7 +520,7 @@ def add_dummy_result(k, results_list): # Reconstruct results in original task order result = [] for k in range(len(rets)): - task_result = result_dict[k] + task_result = result_dict.get(k, []) if shape_grouped: result.extend(task_result) else: diff --git a/csrc/ck_batched_gemm_a8w8/README.md b/csrc/ck_batched_gemm_a8w8/README.md index fb8631baf1..002c9f2ca9 100644 --- a/csrc/ck_batched_gemm_a8w8/README.md +++ b/csrc/ck_batched_gemm_a8w8/README.md @@ -1,4 +1,4 @@ -# CK batched_gemm a8w8 tune +# CK Batched GEMM A8W8 Tune 1. Install aiter: `cd $aiter_path` @@ -10,15 +10,143 @@ |16 |128 |1536 |7168 | 3. Start tuning: -Run the following cmd to start tuning, run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_a8w8_tune via jit: +Run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_a8w8_tune via jit: `python3 csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_batched_gemm.csv -o aiter/configs/a8w8_tuned_batched_gemm.csv` -You can find the results of the tuning in `aiter/configs/a8w8_tuned_batched_gemm.csv`. 
+You can find the results of the tuning in `aiter/configs/a8w8_tuned_batched_gemm.csv`, like this: + |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_batched_gemm_a8w8.py` and run it, please wait a few minutes as it will build batched_gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_batched_gemm.csv` via jit: `python3 op_tests/test_batched_gemm_a8w8.py` -If you have built batched_gemm_a8w8 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_A8W8_BATCHED_GEMM` the default one will be `aiter/configs/a8w8_tuned_batched_gemm.csv`. +If you have built batched_gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_A8W8_BATCHED_GEMM`, the default one will be results merged from `aiter/configs/a8w8_tuned_batched_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_tuned_batched_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_tuned_batched_gemm.csv`. + +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. 
+ +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_batched_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. 
+ +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +**Example**: +```bash +-v +``` +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build batched_gemm_a8w8 kernels in tuned gemm csv by default. If you want to use the new result of batched_gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py index c5eaefdcdd..32a375832e 100644 --- a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. -import os +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
import aiter -import pandas as pd import torch import torch.nn.functional as F from aiter import dtypes @@ -123,7 +121,6 @@ def tune( kernel = kernels_list[i] maxsplitK = ( aiter.compute_batched_gemm_SplitK( - B, M, N, K, diff --git a/csrc/ck_batched_gemm_bf16/README.md b/csrc/ck_batched_gemm_bf16/README.md index 7e77615355..a5c6e2b30f 100644 --- a/csrc/ck_batched_gemm_bf16/README.md +++ b/csrc/ck_batched_gemm_bf16/README.md @@ -1,4 +1,4 @@ -# CK batched_gemm bf16 tune +# CK Batched GEMM BF16 Tune 1. Install aiter: `cd $aiter_path` @@ -9,16 +9,140 @@ |-----|-----|-----|-----| |16 |128 |1536 |7168 | - 3. Start tuning: -Run the following cmd to start tuning, run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_bf16_tune via jit: +Run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_bf16_tune via jit: `python3 csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py -i aiter/configs/bf16_untuned_batched_gemm.csv -o aiter/configs/bf16_tuned_batched_gemm.csv` -You can find the results of the tuning in `aiter/configs/bf16_tuned_batched_gemm.csv`. +You can find the results of the tuning in `aiter/configs/bf16_tuned_batched_gemm.csv`, like this: + |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. 
Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_batched_gemm_bf16.py` and run it, please wait a few minutes as it will build batched_gemm_bf16 tuned kernels in `aiter/configs/bf16_tuned_batched_gemm.csv` via jit: `python3 op_tests/test_batched_gemm_bf16.py` -If you have built batched_gemm_bf16 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_bf16.py`. It will rebuild kernels from `AITER_CONFIG_BF16_BATCHED_GEMM` the default one will be `aiter/configs/bf16_tuned_batched_gemm.csv`. +If you have built batched_gemm_bf16 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_bf16.py`. It will rebuild kernels from `AITER_CONFIG_BF16_BATCHED_GEMM`, the default one will be results merged from `aiter/configs/bf16_tuned_batched_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_bf16_tuned_batched_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/bf16_tuned_batched_gemm.csv`. + +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_bf16_batched_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `B`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. 
Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. 
+ +**Example**: +```bash +-v +``` -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build batched_gemm_bf16 kernels in tuned gemm csv by default. If you want to use the new result of batched_gemm_bf16_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build batched_gemm_bf16 kernels in tuned gemm csv by default. If you want to use the new result of batched_gemm_bf16_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py index 217a9fb2a9..b0e8990b35 100644 --- a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. -import os +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import aiter -import pandas as pd import torch import torch.nn.functional as F from aiter.jit.core import AITER_CONFIG_BF16_BATCHED_GEMM @@ -102,7 +100,6 @@ def tune( kernel = kernels_list[i] maxsplitK = ( aiter.compute_batched_gemm_SplitK( - B, M, N, K, diff --git a/csrc/ck_gemm_a4w4_blockscale/README.md b/csrc/ck_gemm_a4w4_blockscale/README.md index d6e8194077..26798053d1 100755 --- a/csrc/ck_gemm_a4w4_blockscale/README.md +++ b/csrc/ck_gemm_a4w4_blockscale/README.md @@ -1,4 +1,4 @@ -# CK gemm a4w4 blockscale tune +# CK GEMM A4W4 Blockscale Tune 1. Install aiter: `cd $aiter_path` @@ -12,17 +12,143 @@ 3. 
Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a4w4_blockscale_tune via jit: `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/ python3 csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py -i aiter/configs/a4w4_blockscale_untuned_gemm.csv -o aiter/configs/a4w4_blockscale_tuned_gemm.csv` -You can find the results of the tuning in `aiter/configs/a4w4_blockscale_tuned_gemm.csv`. - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | +You can find the results of the tuning in `aiter/configs/a4w4_blockscale_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a4w4_blockscale.py` and run it, please wait a few minutes as it will build gemm_a4w4_blockscale tuned kernels in `aiter/configs/a4w4_blockscale_tuned_gemm.csv` via jit: `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/ python3 op_tests/test_gemm_a4w4_blockscale.py` -If you have built gemm_a4w4 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a4w4_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A4W4` the default one will be `aiter/configs/a4w4_blockscale_tuned_gemm.csv`. 
+If you have built gemm_a4w4 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a4w4_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A4W4`, the default one will be results merged from `aiter/configs/a4w4_blockscale_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a4w4_blockscale_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a4w4_blockscale_tuned_gemm.csv` -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a4w4 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a4w4_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## More Options + +**Note**: All commands require setting `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/` environment variable. + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a4w4_blockscale_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. 
+ +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` + +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a4w4 kernels in tuned gemm csv by default. 
If you want to use the new result of gemm_a4w4_blockscale_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8/README.md b/csrc/ck_gemm_a8w8/README.md index 21acf3cccc..ee0e08336f 100644 --- a/csrc/ck_gemm_a8w8/README.md +++ b/csrc/ck_gemm_a8w8/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 tune +# CK GEMM A8W8 Tune 1. Install aiter: `cd $aiter_path` @@ -9,22 +9,144 @@ |-----|-----|-----| |128 |1536 |7168 | - 3. Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_tune via jit: `python3 csrc/ck_gemm_a8w8/gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_gemm.csv -o aiter/configs/a8w8_tuned_gemm.csv` -If you want to use split K kernels, you can add the `-k` parameter at the end, notice that should change `bias` to `bias/(2^k)`. You can find the results of this tuning in `aiter/configs/a8w8_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. 
Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8.py` -If you have built gemm_a8w8 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8` the default one will be `aiter/configs/a8w8_tuned_gemm.csv`. +If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8`, the default one will be results merged from `aiter/configs/a8w8_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_tuned_gemm.csv`. + +## More Options + +### Split K Kernels +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split K kernels. +**Example**: +```bash +-k +--splitK +``` + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. 
+ + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. Each process runs on a separate GPU. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. Split untuned shapes into batches to manage memory and progress tracking. + +**Example**: +```bash +--batch 50 +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship: + - If `tune_file` == `untune_file`: Retune all shapes in the tune file + - If `tune_file` != `untune_file`: Retune shapes that exist in both files + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. Warmup runs help stabilize GPU state before measurement. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. Useful to prevent hanging on problematic kernels. 
+ +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information, including skipped shapes, tuning progress, and detailed error messages. + +**Example**: +```bash +-v +``` -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_blockscale/README.md b/csrc/ck_gemm_a8w8_blockscale/README.md index 958fa396e3..d087615e8f 100755 --- a/csrc/ck_gemm_a8w8_blockscale/README.md +++ b/csrc/ck_gemm_a8w8_blockscale/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 blockscale tune +# CK GEMM A8W8 Blockscale Tune 1. Install aiter: `cd $aiter_path` @@ -12,17 +12,139 @@ 3. Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_tune via jit: `python3 csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py -i aiter/configs/a8w8_blockscale_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_tuned_gemm.csv` -You can find the results of the tuning in `aiter/configs/a8w8_blockscale_tuned_gemm.csv`. 
- |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | +You can find the results of the tuning in `aiter/configs/a8w8_blockscale_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale tuned kernels in `aiter/configs/a8w8_blockscale_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8_blockscale.py` -If you have built gemm_a8w8 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE` the default one will be `aiter/configs/a8w8_blockscale_tuned_gemm.csv`. +If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE`, the default one will be results merged from `aiter/configs/a8w8_blockscale_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_blockscale_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_blockscale_tuned_gemm.csv`. -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. 
If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_blockscale_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns(e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 1 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. 
+- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 blockscale kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_blockscale_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md index 5e5ea1a914..af83122ef3 100755 --- a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md +++ b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 blockscale bpreshuffle tune +# CK GEMM A8W8 Blockscale BPreshuffle Tune 1. Install aiter: `cd $aiter_path` @@ -12,17 +12,140 @@ 3. 
Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_bpreshuffle_tune via jit: `python3 csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle_tune.py -i aiter/configs/a8w8_blockscale_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` -You can find the results of the tuning in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`. - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | +You can find the results of the tuning in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale_bpreshuffle tuned kernels in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8_blockscale.py` -If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE` the default one will be `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`. 
+If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE`, the default one will be results merged from `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_blockscale_bpreshuffle_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`. -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_blockscale_bpreshuffle_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates.
+ +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 blockscale bpreshuffle kernels in tuned gemm csv by default. 
If you want to use the new result of gemm_a8w8_blockscale_bpreshuffle_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/README.md b/csrc/ck_gemm_a8w8_bpreshuffle/README.md index 40f3a7ba22..c1df848d97 100644 --- a/csrc/ck_gemm_a8w8_bpreshuffle/README.md +++ b/csrc/ck_gemm_a8w8_bpreshuffle/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 tune +# CK GEMM A8W8 BPreshuffle Tune 1. Install aiter: `cd $aiter_path` @@ -13,10 +13,10 @@ 3. Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_tune via jit: `python3 csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py -i aiter/configs/a8w8_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` -You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`. - |**cu_num**|**M**|**N**|**K**|******q_dtype_w******|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|---------------------|------------|----------|------|--------------| - |80 |128 |1536 |7168 |torch.float8_e4m3fnuz|23 |0 |32.99 |xxxxxxxx | +You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**q_dtype_w** |**libtype**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|---------------------|-----------|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |torch.float8_e4m3fnuz| ck | 23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. `q_dtype_w` means the quantization data type of weight, and it is used to distinguish between different quantization data types. 
support torch.int8 and fp8 @@ -24,7 +24,149 @@ You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8 tuned kernels in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8.py` -If you have built gemm_a8w8_bpreshuffle kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE` the default one will be aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`. +If you have built gemm_a8w8_bpreshuffle kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE`, the default one will be results merged from `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_bpreshuffle_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_bpreshuffle_tuned_gemm.csv`. + +## More Options + +### Library Selection + +#### `--libtype` +- **Type**: List of strings +- **Default**: `["all"]` +- **Choices**: `all`, `asm`, `ck`, `cktile` +- **Description**: Choose which library implementations to tune. You can specify one or multiple library types to compare their performance.
+ - `all`: Tune all available library implementations + - `asm`: Use assembly optimized kernels + - `ck`: Use Composable Kernel library + - `cktile`: Use CK Tile library + +**Example**: +```bash +--libtype all +--libtype ck,cktile +--libtype cktile +``` + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_bpreshuffle_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. 
+- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` ## More If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 bpreshuffle kernels in tuned bpreshuffle gemm csv by default. If you want to use the new result of gemm_a8w8_bpreshuffle_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py index 9c39e788ad..0268c655c1 100755 --- a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py +++ b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
import os import sys import aiter @@ -256,9 +256,9 @@ def get_cktile_gemm_a8w8_bpreshuffle_tune_task( M, N, K, - kernel.MPerBLOCK, - kernel.NPerBLOCK, - kernel.KPerBLOCK, + kernel.MTile, + kernel.NTile, + kernel.KTile, ) if useSplitK else 0 diff --git a/csrc/ck_gemm_moe_2stages_codegen/README.md b/csrc/ck_gemm_moe_2stages_codegen/README.md new file mode 100644 index 0000000000..d57b891248 --- /dev/null +++ b/csrc/ck_gemm_moe_2stages_codegen/README.md @@ -0,0 +1,161 @@ +# CK GEMM MoE 2-Stages Codegen or asm 1-stage Tune + +1. Install aiter: +`cd $aiter_path` +`python3 setup.py develop` + +2. Add MoE shapes in `aiter/configs/untuned_fmoe.csv` + |**token**|**model_dim**|**inter_dim**|**expert**|**topk**|**act_type**|**dtype**|**q_dtype_a**|**q_dtype_w**|**q_type**|**use_g1u1**|**doweight_stage1**| + |---------|-------------|-------------|----------|--------|------------|---------|-------------|-------------|----------|------------|-------------------| + |1024 |4096 |14336 |8 |2 |ActivationType.Silu|dtypes.bf16|dtypes.fp8|dtypes.fp8|QuantType.per_Token|True|True| + + +3. 
Start tuning: +Run the following cmd to start tuning, please wait a few minutes as it will build moe 2-stages kernels via jit: +`python3 csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py -i aiter/configs/untuned_fmoe.csv -o aiter/configs/tuned_fmoe.csv` +You can find the results of this tuning in `aiter/configs/tuned_fmoe.csv`, like this: + |**cu_num**|**token**|**model_dim**|**inter_dim**|**expert**|**topk**|**act_type**|**dtype**|**q_dtype_a**|**q_dtype_w**|**q_type**|**use_g1u1**|**doweight_stage1**|**block_m**|**ksplit**|**us1**|**kernelName1**|**err1**|**us2**|**kernelName2**|**err2**|**us**|**run_1stage**|**tflops**|**bw**| + |----------|---------|-------------|-------------|----------|--------|------------|---------|-------------|-------------|----------|------------|-------------------|-----------|----------|-------|---------------|--------|-------|---------------|--------|------|--------------|----------|------| + |80 |1024 |4096 |14336 |8 |2 |ActivationType.Silu|dtypes.bf16|dtypes.fp8|dtypes.fp8|QuantType.per_Token|True|True|64|0|45.23|kernel_stage1|0.5%|38.67|kernel_stage2|0.3%|83.90|0|125.4|89.5| + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `run_1stage` indicates whether to run fused 1-stage kernel (1) or 2-stages kernels (0). + +4. Build tuned kernels and test: +Test the performance, modify the test instance in `op_tests/test_moe.py` or `python3 op_tests/test_moe_2stage.py` and run it, please wait a few minutes as it will build moe tuned kernels in `aiter/configs/tuned_fmoe.csv` via jit: +`python3 op_tests/test_moe.py` or `python3 op_tests/test_moe_2stage.py` +If you have built moe kernels before tuning new MoE shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_moe.py`. 
It will rebuild kernels from `AITER_CONFIG_FMOE`, the default one will be results merged from `aiter/configs/tuned_fmoe.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_tuned_fmoe_xx.csv`, the merged result is stored in `/tmp/aiter_configs/tuned_fmoe.csv`. + +## More Options + +### Tuning Scope + +#### `--last` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Only tune the last kernel in the CSV file. Useful for quickly testing newly added shapes. + +**Example**: +```bash +--last +``` + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_fmoe_all.csv +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.5` (50%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 8 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. 
+- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` + +## Notes +- This tuner supports both 1-stage fused MoE kernels and 2-stages MoE kernels (stage1 and stage2) +- The tuner will automatically select the best kernel configuration based on performance +- Only G1U1 (gate-up fused) MoE configurations are currently supported for tuning +- Supported quantization types include: per_Token, per_1x128 (blockscale), per_1x32 (MXFP4, gfx950 only) +- If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build moe kernels in tuned csv by default. If you want to use the new result of moe tuning, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. 
+ diff --git a/hsa/gfx942/fmoe_2stages/tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py similarity index 97% rename from hsa/gfx942/fmoe_2stages/tune.py rename to csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index f2a4014d8f..501142409f 100644 --- a/hsa/gfx942/fmoe_2stages/tune.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import torch import aiter @@ -1278,22 +1278,42 @@ def calculate(self, results, bpes=(1, 1, 2)): return tflops, bw def get_1stage_file_info(self, q_type, q_dtype_a, doweight_stage1): - extraInfo_1stage = "" - if q_dtype_a == dtypes.i8: - quantDtype = "Int8" - elif q_dtype_a == dtypes.fp8: - quantDtype = "Fp8" - else: - quantDtype = "" - if doweight_stage1: - extraInfo_1stage = "_tkw1" - if q_type == QuantType.No: - quantDtype_1stage = "noquant" - elif q_type == QuantType.per_1x128: - quantDtype_1stage = "blockscale" + quantDtype - else: - quantDtype_1stage = "pertoken" + quantDtype - return quantDtype_1stage, extraInfo_1stage + if get_gfx() == "gfx950": + extraInfo_1stage = "" + if q_dtype_a == dtypes.i8: + quantDtype = "Int8" + elif q_dtype_a == dtypes.fp8: + quantDtype = "Fp8" + else: + quantDtype = "" + if doweight_stage1: + extraInfo_1stage = "_tkw1" + if q_type == QuantType.No: + quantDtype_1stage = "noquant" + elif q_type == QuantType.per_1x128: + quantDtype_1stage = "blockscale" + quantDtype + elif q_type == QuantType.per_1x32: + quantDtype_1stage = "pertoken" + "MXfp4" + else: + quantDtype_1stage = "pertoken" + quantDtype + return quantDtype_1stage, extraInfo_1stage + elif get_gfx() == "gfx942": + extraInfo_1stage = "" + if q_dtype_a == dtypes.i8: + quantDtype = "Int8" + elif q_dtype_a == dtypes.fp8: + quantDtype = "Fp8" + else: + quantDtype = "" + if doweight_stage1: + extraInfo_1stage = "_tkw1" + if q_type == 
QuantType.No: + quantDtype_1stage = "noquant" + elif q_type == QuantType.per_1x128: + quantDtype_1stage = "blockscale" + quantDtype + else: + quantDtype_1stage = "pertoken" + quantDtype + return quantDtype_1stage, extraInfo_1stage def gen_1stage_asm_task(self, key): task_1stage = [] diff --git a/hsa/gfx950/fmoe_2stages/tune.py b/hsa/gfx950/fmoe_2stages/tune.py deleted file mode 100644 index aa5b417007..0000000000 --- a/hsa/gfx950/fmoe_2stages/tune.py +++ /dev/null @@ -1,191 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -import torch -import pandas as pd -import time -import os -import sys -from aiter import QuantType -from aiter.jit.core import ( - AITER_CSRC_DIR, - AITER_META_DIR, - AITER_CONFIG_FMOE, -) -from aiter.utility.mp_tuner import mp_tuner - -from aiter import dtypes -from aiter import ActivationType as ActivationType - -sys.path.insert(0, f"{AITER_META_DIR}/hsa/gfx942") -from fmoe_2stages.tune import FmoeTuner - - -sys.path.insert(0, f"{AITER_CSRC_DIR}/ck_gemm_moe_2stages_codegen/") - - -torch.set_default_device("cuda") -torch.int4 = getattr(torch, "int4", torch.uint32) - - -def get_kernels_dict(file, key="tile_m"): - if not os.path.exists(file): - print(f"ASM kernel list file not exist: {file}") - return {} - df = pd.read_csv(file) - kernel_dict = df.groupby(key)["knl_name"].apply(list).to_dict() - return kernel_dict - - -class FmoeTuner950(FmoeTuner): - ARG_DEFAULTS = { - "verbose": False, - "tune_file": f"{AITER_CONFIG_FMOE}", - "untune_file": "aiter/configs/untuned_fmoe.csv", - "errRatio": 0.5, - "batch": 100, - "profile_file": "aiter/configs/profile_fmoe.csv", # for all results - } - - def get_1stage_file_info(self, q_type, q_dtype_a, doweight_stage1): - extraInfo_1stage = "" - if q_dtype_a == dtypes.i8: - quantDtype = "Int8" - elif q_dtype_a == dtypes.fp8: - quantDtype = "Fp8" - else: - quantDtype = "" - if doweight_stage1: - extraInfo_1stage = "_tkw1" - if q_type == 
QuantType.No: - quantDtype_1stage = "noquant" - elif q_type == QuantType.per_1x128: - quantDtype_1stage = "blockscale" + quantDtype - elif q_type == QuantType.per_1x32: - quantDtype_1stage = "pertoken" + "MXfp4" - else: - quantDtype_1stage = "pertoken" + quantDtype - return quantDtype_1stage, extraInfo_1stage - - def tune( - self, - untunedf, - tunedf, - args, - ): - mp_num = args.mp - startTS = time.perf_counter() - # blockMs = [16, 32, 48, 64, 80, 96, 112, 128, 144, 160] - blockMs = [16, 32, 64, 128] - - args = self.keys - print(untunedf[args]) - tasks = [] - tasks_ck = [] - task_1stage = [] - in_data = [] - for line in untunedf[args].values: - ( - cu_num, - token, - model_dim, - inter_dim, - expert, - topk, - act_type, - dtype, - q_dtype_a, - q_dtype_w, - q_type, - use_g1u1, - doweight_stage1, - ) = line - dtype = eval(dtype) - q_dtype_a = eval(q_dtype_a) - q_dtype_w = eval(q_dtype_w) - q_type = eval(q_type) - q_type = QuantType.per_1x128 if q_type == QuantType.per_128x128 else q_type - print("\nStart tuning", line) - if not use_g1u1: - print("no moe solution(g1u0) can tune for ", line) - continue - act_type = eval(act_type) - info = ( - cu_num, - token, - model_dim, - inter_dim, - expert, - topk, - act_type, - dtype, - q_dtype_a, - q_dtype_w, - q_type, - use_g1u1, - doweight_stage1, - ) - tasks.extend(self.gen_2stages_asm1_task(info, blockMs)) - tasks_ck.extend(self.gen_2stages_task(info, blockMs)) - task_1stage.extend(self.gen_1stage_asm_task(info)) - if tasks is None and tasks_ck is None and task_1stage is None: - print("no moe solution can tune for ", line) - continue - print( - f"stage1 asm tasks is {len(tasks)}, tasks_ck is {len(tasks_ck)}, task_1stage is {len(task_1stage)}" - ) - in_data.append((len(tasks) + len(tasks_ck) + len(task_1stage), ())) - rets = [] - if len(tasks) + len(tasks_ck) + len(task_1stage) > 0: - ### shape_grouped should be False as multiple stages - rets = mp_tuner( - tasks + tasks_ck + task_1stage, - in_data, - mp_num, - True, - 
False, - timeout=args.timeout, - verbose=args.verbose, - ) - if not rets: - print("no shape to tune or no solution found") - return [] - else: - return rets - - -if __name__ == "__main__": - - key = [ - "cu_num", - "token", - "model_dim", - "inter_dim", - "expert", - "topk", - "act_type", - "dtype", - "q_dtype_a", - "q_dtype_w", - "q_type", - "use_g1u1", - "doweight_stage1", - ] - resultList = [ - "block_m", - "ksplit", - "us1", - "kernelName1", - "err1", - "us2", - "kernelName2", - "err2", - "us", - "run_1stage", - "tflops", - "bw", - ] - tuner = FmoeTuner950("fmoeTuner950", key, resultList, "fmoe tuner on gfx950") - args = tuner.parse_args() - - tuner.run(args, False)