diff --git a/aiter/utility/mp_tuner.py b/aiter/utility/mp_tuner.py index b99f9e4ffa..6a4d8652b7 100644 --- a/aiter/utility/mp_tuner.py +++ b/aiter/utility/mp_tuner.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import torch import multiprocessing as mp import time @@ -37,7 +37,7 @@ def worker( us = round(us, 4) except RuntimeError as e: - print(f"run gpu func error: info:{info}\t {e}") + print(f"run gpu func warning: info:{info}\t {e}", flush=True) us = -1 # not support or error max_err_ratio = 1.0 max_retries = 3 @@ -82,24 +82,28 @@ def worker( max_err_ratio = max(max_err_ratio, err_ratio) except RuntimeError as e: if "CUDA" in str(e) or "HIP" in str(e) or "out of memory" in str(e).lower(): - print(f"GPU Runtime Error in process:{pid} info:{info}: {e}") + if printLog: + print(f"GPU Runtime Error in process:{pid} info:{info}: {e}") # Try to recover GPU state try: torch.cuda.empty_cache() torch.cuda.synchronize() except Exception as e: - print(f"Error in process:{pid} info:{info}: {e}") + if printLog: + print(f"Error in process:{pid} info:{info}: {e}") pass else: print(f"Runtime Error in process:{pid} info:{info}: {e}") us = -1 # float("inf") max_err_ratio = 1.0 except TimeoutError as e: - print(f"Timeout in process:{pid} info:{info}: {e}") + if printLog: + print(f"Timeout in process:{pid} info:{info}: {e}") us = float("inf") max_err_ratio = 1.0 except Exception as e: - print(f"Unexpected Error in process:{pid} info:{info}: {e}") + if printLog: + print(f"Unexpected Error in process:{pid} info:{info}: {e}") # import traceback # traceback.print_exc() @@ -109,7 +113,7 @@ def worker( return info, us, round(max_err_ratio, 4) -def work_group(GPUIDMap, fast_mode, err_ratio, in_data, tasks, printLog=False): +def work_group(GPUIDMap, fast_mode, err_ratio, in_data, tasks, verbose=False): """Work group that processes a 
batch of related tasks.""" group_task = [tasks] if not isinstance(tasks, list) else tasks kernels_num, (input_data) = in_data @@ -204,7 +208,7 @@ def work_group(GPUIDMap, fast_mode, err_ratio, in_data, tasks, printLog=False): ) # Run worker with explicit GPU ID - ret = worker(*work_args, tol_err_ratio=err_ratio) + ret = worker(*work_args, printLog=verbose, tol_err_ratio=err_ratio) rets.append(ret) return rets @@ -458,7 +462,8 @@ def add_dummy_result(k, results_list): # pool_restart_needed = True else: error_msg = f"[Failed] Task {k} failed with {error_type}: {e}" - # pool_restart_needed = True + failed_tasks.append((k, "timeout")) + completed_this_round.append((k, async_result)) # Only log error once per error type if error_type not in logged_error_types: @@ -515,7 +520,7 @@ def add_dummy_result(k, results_list): # Reconstruct results in original task order result = [] for k in range(len(rets)): - task_result = result_dict[k] + task_result = result_dict.get(k, []) if shape_grouped: result.extend(task_result) else: diff --git a/csrc/ck_batched_gemm_a8w8/README.md b/csrc/ck_batched_gemm_a8w8/README.md index fb8631baf1..002c9f2ca9 100644 --- a/csrc/ck_batched_gemm_a8w8/README.md +++ b/csrc/ck_batched_gemm_a8w8/README.md @@ -1,4 +1,4 @@ -# CK batched_gemm a8w8 tune +# CK Batched GEMM A8W8 Tune 1. Install aiter: `cd $aiter_path` @@ -10,15 +10,143 @@ |16 |128 |1536 |7168 | 3. Start tuning: -Run the following cmd to start tuning, run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_a8w8_tune via jit: +Run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_a8w8_tune via jit: `python3 csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_batched_gemm.csv -o aiter/configs/a8w8_tuned_batched_gemm.csv` -You can find the results of the tuning in `aiter/configs/a8w8_tuned_batched_gemm.csv`. 
+You can find the results of the tuning in `aiter/configs/a8w8_tuned_batched_gemm.csv`, like this: + |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_batched_gemm_a8w8.py` and run it, please wait a few minutes as it will build batched_gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_batched_gemm.csv` via jit: `python3 op_tests/test_batched_gemm_a8w8.py` -If you have built batched_gemm_a8w8 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_A8W8_BATCHED_GEMM` the default one will be `aiter/configs/a8w8_tuned_batched_gemm.csv`. +If you have built batched_gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_A8W8_BATCHED_GEMM`, the default one will be results merged from `aiter/configs/a8w8_tuned_batched_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_tuned_batched_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_tuned_batched_gemm.csv`. + +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. 
+ +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_batched_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. 
+ +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +**Example**: +```bash +-v +``` +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build batched_gemm_a8w8 kernels in tuned gemm csv by default. If you want to use the new result of batched_gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py index c5eaefdcdd..32a375832e 100644 --- a/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py +++ b/csrc/ck_batched_gemm_a8w8/batched_gemm_a8w8_tune.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. -import os +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
import aiter -import pandas as pd import torch import torch.nn.functional as F from aiter import dtypes @@ -123,7 +121,6 @@ def tune( kernel = kernels_list[i] maxsplitK = ( aiter.compute_batched_gemm_SplitK( - B, M, N, K, diff --git a/csrc/ck_batched_gemm_bf16/README.md b/csrc/ck_batched_gemm_bf16/README.md index 7e77615355..a5c6e2b30f 100644 --- a/csrc/ck_batched_gemm_bf16/README.md +++ b/csrc/ck_batched_gemm_bf16/README.md @@ -1,4 +1,4 @@ -# CK batched_gemm bf16 tune +# CK Batched GEMM BF16 Tune 1. Install aiter: `cd $aiter_path` @@ -9,16 +9,140 @@ |-----|-----|-----|-----| |16 |128 |1536 |7168 | - 3. Start tuning: -Run the following cmd to start tuning, run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_bf16_tune via jit: +Run the following cmd to start tuning, please wait a few minutes as it will build batched_gemm_bf16_tune via jit: `python3 csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py -i aiter/configs/bf16_untuned_batched_gemm.csv -o aiter/configs/bf16_tuned_batched_gemm.csv` -You can find the results of the tuning in `aiter/configs/bf16_tuned_batched_gemm.csv`. +You can find the results of the tuning in `aiter/configs/bf16_tuned_batched_gemm.csv`, like this: + |**cu_num**|**B**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |16 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. 
Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_batched_gemm_bf16.py` and run it, please wait a few minutes as it will build batched_gemm_bf16 tuned kernels in `aiter/configs/bf16_tuned_batched_gemm.csv` via jit: `python3 op_tests/test_batched_gemm_bf16.py` -If you have built batched_gemm_bf16 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_bf16.py`. It will rebuild kernels from `AITER_CONFIG_BF16_BATCHED_GEMM` the default one will be `aiter/configs/bf16_tuned_batched_gemm.csv`. +If you have built batched_gemm_bf16 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_batched_gemm_bf16.py`. It will rebuild kernels from `AITER_CONFIG_BF16_BATCHED_GEMM`, the default one will be results merged from `aiter/configs/bf16_tuned_batched_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_bf16_tuned_batched_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/bf16_tuned_batched_gemm.csv`. + +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_bf16_batched_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `B`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. 
Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. 
+ +**Example**: +```bash +-v +``` -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build batched_gemm_bf16 kernels in tuned gemm csv by default. If you want to use the new result of batched_gemm_bf16_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build batched_gemm_bf16 kernels in tuned gemm csv by default. If you want to use the new result of batched_gemm_bf16_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py index 217a9fb2a9..b0e8990b35 100644 --- a/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py +++ b/csrc/ck_batched_gemm_bf16/batched_gemm_bf16_tune.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. -import os +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import aiter -import pandas as pd import torch import torch.nn.functional as F from aiter.jit.core import AITER_CONFIG_BF16_BATCHED_GEMM @@ -102,7 +100,6 @@ def tune( kernel = kernels_list[i] maxsplitK = ( aiter.compute_batched_gemm_SplitK( - B, M, N, K, diff --git a/csrc/ck_gemm_a4w4_blockscale/README.md b/csrc/ck_gemm_a4w4_blockscale/README.md index d6e8194077..26798053d1 100755 --- a/csrc/ck_gemm_a4w4_blockscale/README.md +++ b/csrc/ck_gemm_a4w4_blockscale/README.md @@ -1,4 +1,4 @@ -# CK gemm a4w4 blockscale tune +# CK GEMM A4W4 Blockscale Tune 1. Install aiter: `cd $aiter_path` @@ -12,17 +12,143 @@ 3. 
Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a4w4_blockscale_tune via jit: `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/ python3 csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py -i aiter/configs/a4w4_blockscale_untuned_gemm.csv -o aiter/configs/a4w4_blockscale_tuned_gemm.csv` -You can find the results of the tuning in `aiter/configs/a4w4_blockscale_tuned_gemm.csv`. - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | +You can find the results of the tuning in `aiter/configs/a4w4_blockscale_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a4w4_blockscale.py` and run it, please wait a few minutes as it will build gemm_a4w4_blockscale tuned kernels in `aiter/configs/a4w4_blockscale_tuned_gemm.csv` via jit: `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/ python3 op_tests/test_gemm_a4w4_blockscale.py` -If you have built gemm_a4w4 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a4w4_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A4W4` the default one will be `aiter/configs/a4w4_blockscale_tuned_gemm.csv`. 
+If you have built gemm_a4w4 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a4w4_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A4W4`, the default one will be results merged from `aiter/configs/a4w4_blockscale_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a4w4_blockscale_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a4w4_blockscale_tuned_gemm.csv` -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a4w4 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a4w4_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## More Options + +**Note**: All commands require setting `GEMM_A4W4_BLOCKWISE_HIP_CLANG_PATH=/work/llvm-project/build/bin/` environment variable. + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a4w4_blockscale_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. 
+ +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` + +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a4w4 kernels in tuned gemm csv by default. 
If you want to use the new result of gemm_a4w4_blockscale_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8/README.md b/csrc/ck_gemm_a8w8/README.md index 21acf3cccc..ee0e08336f 100644 --- a/csrc/ck_gemm_a8w8/README.md +++ b/csrc/ck_gemm_a8w8/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 tune +# CK GEMM A8W8 Tune 1. Install aiter: `cd $aiter_path` @@ -9,22 +9,144 @@ |-----|-----|-----| |128 |1536 |7168 | - 3. Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_tune via jit: `python3 csrc/ck_gemm_a8w8/gemm_a8w8_tune.py -i aiter/configs/a8w8_untuned_gemm.csv -o aiter/configs/a8w8_tuned_gemm.csv` -If you want to use split K kernels, you can add the `-k` parameter at the end, notice that should change `bias` to `bias/(2^k)`. You can find the results of this tuning in `aiter/configs/a8w8_tuned_gemm.csv`, like this: - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. 
Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8 tuned kernels in `aiter/configs/a8w8_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8.py` -If you have built gemm_a8w8 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8` the default one will be `aiter/configs/a8w8_tuned_gemm.csv`. +If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8`, the default one will be results merged from `aiter/configs/a8w8_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_tuned_gemm.csv`. + +## More Options + +### Split K Kernels +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split K kernels. +**Example**: +```bash +-k +--splitK +``` + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. 
+ + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. Each process runs on a separate GPU. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. Split untuned shapes into batches to manage memory and progress tracking. + +**Example**: +```bash +--batch 50 +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship: + - If `tune_file` == `untune_file`: Retune all shapes in the tune file + - If `tune_file` != `untune_file`: Retune shapes that exist in both files + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. Warmup runs help stabilize GPU state before measurement. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. Useful to prevent hanging on problematic kernels. 
+ +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information, including skipped shapes, tuning progress, and detailed error messages. + +**Example**: +```bash +-v +``` -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_blockscale/README.md b/csrc/ck_gemm_a8w8_blockscale/README.md index 958fa396e3..d087615e8f 100755 --- a/csrc/ck_gemm_a8w8_blockscale/README.md +++ b/csrc/ck_gemm_a8w8_blockscale/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 blockscale tune +# CK GEMM A8W8 Blockscale Tune 1. Install aiter: `cd $aiter_path` @@ -12,17 +12,139 @@ 3. Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_tune via jit: `python3 csrc/ck_gemm_a8w8_blockscale/gemm_a8w8_blockscale_tune.py -i aiter/configs/a8w8_blockscale_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_tuned_gemm.csv` -You can find the results of the tuning in `aiter/configs/a8w8_blockscale_tuned_gemm.csv`. 
- |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | +You can find the results of the tuning in `aiter/configs/a8w8_blockscale_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale tuned kernels in `aiter/configs/a8w8_blockscale_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8_blockscale.py` -If you have built gemm_a8w8 kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE` the default one will be `aiter/configs/a8w8_blockscale_tuned_gemm.csv`. +If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE`, the default one will be results merged from `aiter/configs/a8w8_blockscale_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_blockscale_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_blockscale_tuned_gemm.csv`. -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. 
If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_blockscale_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns(e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 1 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. 
+- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 blockscale kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_blockscale_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md index 5e5ea1a914..af83122ef3 100755 --- a/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md +++ b/csrc/ck_gemm_a8w8_blockscale_bpreshuffle/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 blockscale bpreshuffle tune +# CK GEMM A8W8 Blockscale BPreshuffle Tune 1. Install aiter: `cd $aiter_path` @@ -12,17 +12,140 @@ 3. 
Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_blockscale_bpreshuffle_tune via jit: `python3 csrc/ck_gemm_a8w8_blockscale_bpreshuffle/gemm_a8w8_blockscale_bpreshuffle_tune.py -i aiter/configs/a8w8_blockscale_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` -You can find the results of the tuning in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`. - |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|------------|----------|------|--------------| - |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx | +You can find the results of the tuning in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8_blockscale.py` and run it, please wait a few minutes as it will build gemm_a8w8_blockscale_bpreshuffle tuned kernels in `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8_blockscale.py` -If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE` the default one will be `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`. 
+If you have built gemm_a8w8 kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8_blockscale.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE`, the default one will be results merged from `aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_blockscale_bpreshuffle_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv`. -## More -If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 kernels in tuned gemm csv by default. If you want to use the new result of gemm_a8w8_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. +## More Options + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_blockscale_bpreshuffle_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates.
+ +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. +- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` +## Notes +If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 blockscale bpreshuffle kernels in tuned gemm csv by default. 
If you want to use the new result of gemm_a8w8_blockscale_bpreshuffle_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/README.md b/csrc/ck_gemm_a8w8_bpreshuffle/README.md index 40f3a7ba22..c1df848d97 100644 --- a/csrc/ck_gemm_a8w8_bpreshuffle/README.md +++ b/csrc/ck_gemm_a8w8_bpreshuffle/README.md @@ -1,4 +1,4 @@ -# CK gemm a8w8 tune +# CK GEMM A8W8 BPreshuffle Tune 1. Install aiter: `cd $aiter_path` @@ -13,10 +13,10 @@ 3. Start tuning: Run the following cmd to start tuning, please wait a few minutes as it will build gemm_a8w8_bpreshuffle_tune via jit: `python3 csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py -i aiter/configs/a8w8_bpreshuffle_untuned_gemm.csv -o aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` -You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`. - |**cu_num**|**M**|**N**|**K**|******q_dtype_w******|**kernelId**|**splitK**|**us**|**kernelName**| - |----------|-----|-----|-----|---------------------|------------|----------|------|--------------| - |80 |128 |1536 |7168 |torch.float8_e4m3fnuz|23 |0 |32.99 |xxxxxxxx | +You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`, like this: + |**cu_num**|**M**|**N**|**K**|**q_dtype_w** |**libtype**|**kernelId**|**splitK**|**us**|**kernelName**|**tflops**|**bw**|**errRatio**| + |----------|-----|-----|-----|---------------------|-----------|------------|----------|------|--------------|----------|------|------------| + |80 |128 |1536 |7168 |torch.float8_e4m3fnuz| ck | 23 |0 |32.99 |xxxxxxxx |125.4 |89.5 |0.01 | `cu_num` means the number of compute units, and it is used to distinguish between graphics. `q_dtype_w` means the quantization data type of weight, and it is used to distinguish between different quantization data types. 
support torch.int8 and fp8 @@ -24,7 +24,149 @@ You can find the results of this tuning in `aiter/configs/a8w8_bpreshuffle_tuned 4. Build tuned kernels and test: Test the performance, modify the test instance in `op_tests/test_gemm_a8w8.py` and run it, please wait a few minutes as it will build gemm_a8w8 tuned kernels in `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` via jit: `python3 op_tests/test_gemm_a8w8.py` -If you have built gemm_a8w8_bpreshuffle kernels brefore tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE` the default one will be aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv`. +If you have built gemm_a8w8_bpreshuffle kernels before tuning new GEMM shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_gemm_a8w8.py`. It will rebuild kernels from `AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE`, the default one will be results merged from `aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_a8w8_bpreshuffle_tuned_gemm_xx.csv`, the merged result is stored in `/tmp/aiter_configs/a8w8_bpreshuffle_tuned_gemm.csv`. + +## More Options + +### Library Selection + +#### `--libtype` +- **Type**: List of strings +- **Default**: `["all"]` +- **Choices**: `all`, `asm`, `ck`, `cktile` +- **Description**: Choose which library implementations to tune. You can specify one or multiple library types to compare their performance.
+ - `all`: Tune all available library implementations + - `asm`: Use assembly optimized kernels + - `ck`: Use Composable Kernel library + - `cktile`: Use CK Tile library + +**Example**: +```bash +--libtype all +--libtype ck,cktile +--libtype cktile +``` + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_a8w8_bpreshuffle_all.csv +``` + +#### `--sort` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Sort the output file according to the key columns (e.g., `cu_num`, `N`, `M`, `K` for GEMM). Useful for maintaining consistent ordering in result files. + +**Example**: +```bash +--sort +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.05` (5%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 4 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. 
+- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` ## More If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build gemm a8w8 bpreshuffle kernels in tuned bpreshuffle gemm csv by default. If you want to use the new result of gemm_a8w8_bpreshuffle_tune, please remove `build` and `*.so` in `aiter/jit` first, then re-intall aiter after finishing tune. This can take a lot of time and is not recommended. diff --git a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py index 9c39e788ad..0268c655c1 100755 --- a/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py +++ b/csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
import os import sys import aiter @@ -256,9 +256,9 @@ def get_cktile_gemm_a8w8_bpreshuffle_tune_task( M, N, K, - kernel.MPerBLOCK, - kernel.NPerBLOCK, - kernel.KPerBLOCK, + kernel.MTile, + kernel.NTile, + kernel.KTile, ) if useSplitK else 0 diff --git a/csrc/ck_gemm_moe_2stages_codegen/README.md b/csrc/ck_gemm_moe_2stages_codegen/README.md new file mode 100644 index 0000000000..d57b891248 --- /dev/null +++ b/csrc/ck_gemm_moe_2stages_codegen/README.md @@ -0,0 +1,161 @@ +# CK GEMM MoE 2-Stages Codegen or asm 1-stage Tune + +1. Install aiter: +`cd $aiter_path` +`python3 setup.py develop` + +2. Add MoE shapes in `aiter/configs/untuned_fmoe.csv` + |**token**|**model_dim**|**inter_dim**|**expert**|**topk**|**act_type**|**dtype**|**q_dtype_a**|**q_dtype_w**|**q_type**|**use_g1u1**|**doweight_stage1**| + |---------|-------------|-------------|----------|--------|------------|---------|-------------|-------------|----------|------------|-------------------| + |1024 |4096 |14336 |8 |2 |ActivationType.Silu|dtypes.bf16|dtypes.fp8|dtypes.fp8|QuantType.per_Token|True|True| + + +3. 
Start tuning: +Run the following cmd to start tuning, please wait a few minutes as it will build moe 2-stages kernels via jit: +`python3 csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py -i aiter/configs/untuned_fmoe.csv -o aiter/configs/tuned_fmoe.csv` +You can find the results of this tuning in `aiter/configs/tuned_fmoe.csv`, like this: + |**cu_num**|**token**|**model_dim**|**inter_dim**|**expert**|**topk**|**act_type**|**dtype**|**q_dtype_a**|**q_dtype_w**|**q_type**|**use_g1u1**|**doweight_stage1**|**block_m**|**ksplit**|**us1**|**kernelName1**|**err1**|**us2**|**kernelName2**|**err2**|**us**|**run_1stage**|**tflops**|**bw**| + |----------|---------|-------------|-------------|----------|--------|------------|---------|-------------|-------------|----------|------------|-------------------|-----------|----------|-------|---------------|--------|-------|---------------|--------|------|--------------|----------|------| + |80 |1024 |4096 |14336 |8 |2 |ActivationType.Silu|dtypes.bf16|dtypes.fp8|dtypes.fp8|QuantType.per_Token|True|True|64|0|45.23|kernel_stage1|0.5%|38.67|kernel_stage2|0.3%|83.90|0|125.4|89.5| + + `cu_num` means the number of compute units, and it is used to distinguish between graphics. + `run_1stage` indicates whether to run fused 1-stage kernel (1) or 2-stages kernels (0). + +4. Build tuned kernels and test: +Test the performance, modify the test instance in `op_tests/test_moe.py` or `python3 op_tests/test_moe_2stage.py` and run it, please wait a few minutes as it will build moe tuned kernels in `aiter/configs/tuned_fmoe.csv` via jit: +`python3 op_tests/test_moe.py` or `python3 op_tests/test_moe_2stage.py` +If you have built moe kernels before tuning new MoE shapes, please add `AITER_REBUILD=1` before your test cmd, such as `AITER_REBUILD=1 python3 op_tests/test_moe.py`. 
It will rebuild kernels from `AITER_CONFIG_FMOE`, the default one will be results merged from `aiter/configs/tuned_fmoe.csv` and tuned fmoe csv under `aiter/configs/model_configs/xx_tuned_fmoe_xx.csv`, the merged result is stored in `/tmp/aiter_configs/tuned_fmoe.csv`. + +## More Options + +### Tuning Scope + +#### `--last` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Only tune the last kernel in the CSV file. Useful for quickly testing newly added shapes. + +**Example**: +```bash +--last +``` + +### Output Configuration + +#### `-o2, --profile_file` +- **Type**: String +- **Default**: `""` (empty string) +- **Description**: Optional output file to store **all** tuning results (not just the best ones). Useful for profiling and analyzing all kernel candidates. + +**Example**: +```bash +--profile_file aiter/configs/profile_fmoe_all.csv +``` + +### Tuning Configuration + +#### `--errRatio` +- **Type**: Float +- **Default**: `0.5` (50%) +- **Description**: Tolerable error ratio threshold. Only kernels with error ratios below this threshold will be considered valid candidates. + +**Example**: +```bash +--errRatio 0.01 +``` + +#### `--mp` +- **Type**: Integer +- **Default**: Number of available GPUs +- **Description**: Number of parallel processes to use for tuning across multiple GPUs. + +**Example**: +```bash +--mp 8 +``` + +#### `--batch` +- **Type**: Integer +- **Default**: `100` +- **Description**: Number of shapes to tune in each batch. + +**Example**: +```bash +--batch 50 +``` + +#### `-k, --splitK` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable split-K optimization for GEMM kernels. Split-K divides the K dimension across multiple workgroups to improve parallelism and performance for certain shapes. + +**Example**: +```bash +-k +--splitK +``` + +#### `--all` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Retune all shapes based on file relationship. 
+- If `tune_file` == `untune_file`: Retune all shapes in the tune file +- If `tune_file` != `untune_file`: Retune shapes that exist in untuned file + + +**Example**: +```bash +--all +``` + +### Profiling Configuration + +#### `--warmup` +- **Type**: Integer +- **Default**: `5` +- **Description**: Number of warmup iterations before profiling. + +**Example**: +```bash +--warmup 10 +``` + +#### `--iters` +- **Type**: Integer +- **Default**: `101` +- **Description**: Number of profiling iterations to run for performance measurement. + +**Example**: +```bash +--iters 200 +``` + +#### `--timeout` +- **Type**: Integer +- **Default**: `None` +- **Description**: Timeout in seconds for each task group. + +**Example**: +```bash +--timeout 300 +``` + +### Debugging and Verbose Output + +#### `-v, --verbose` +- **Type**: Flag (boolean) +- **Default**: `False` +- **Description**: Enable verbose output with detailed logging information. + +**Example**: +```bash +-v +``` + +## Notes +- This tuner supports both 1-stage fused MoE kernels and 2-stages MoE kernels (stage1 and stage2) +- The tuner will automatically select the best kernel configuration based on performance +- Only G1U1 (gate-up fused) MoE configurations are currently supported for tuning +- Supported quantization types include: per_Token, per_1x128 (blockscale), per_1x32 (MXFP4, gfx950 only) +- If you use flag `PREBUILD_KERNELS=1` when you install aiter, it will build moe kernels in tuned csv by default. If you want to use the new result of moe tuning, please remove `build` and `*.so` in `aiter/jit` first, then re-install aiter after finishing tune. This can take a lot of time and is not recommended. 
+ diff --git a/hsa/gfx942/fmoe_2stages/tune.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py similarity index 97% rename from hsa/gfx942/fmoe_2stages/tune.py rename to csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py index f2a4014d8f..501142409f 100644 --- a/hsa/gfx942/fmoe_2stages/tune.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_tune.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import torch import aiter @@ -1278,22 +1278,42 @@ def calculate(self, results, bpes=(1, 1, 2)): return tflops, bw def get_1stage_file_info(self, q_type, q_dtype_a, doweight_stage1): - extraInfo_1stage = "" - if q_dtype_a == dtypes.i8: - quantDtype = "Int8" - elif q_dtype_a == dtypes.fp8: - quantDtype = "Fp8" - else: - quantDtype = "" - if doweight_stage1: - extraInfo_1stage = "_tkw1" - if q_type == QuantType.No: - quantDtype_1stage = "noquant" - elif q_type == QuantType.per_1x128: - quantDtype_1stage = "blockscale" + quantDtype - else: - quantDtype_1stage = "pertoken" + quantDtype - return quantDtype_1stage, extraInfo_1stage + if get_gfx() == "gfx950": + extraInfo_1stage = "" + if q_dtype_a == dtypes.i8: + quantDtype = "Int8" + elif q_dtype_a == dtypes.fp8: + quantDtype = "Fp8" + else: + quantDtype = "" + if doweight_stage1: + extraInfo_1stage = "_tkw1" + if q_type == QuantType.No: + quantDtype_1stage = "noquant" + elif q_type == QuantType.per_1x128: + quantDtype_1stage = "blockscale" + quantDtype + elif q_type == QuantType.per_1x32: + quantDtype_1stage = "pertoken" + "MXfp4" + else: + quantDtype_1stage = "pertoken" + quantDtype + return quantDtype_1stage, extraInfo_1stage + elif get_gfx() == "gfx942": + extraInfo_1stage = "" + if q_dtype_a == dtypes.i8: + quantDtype = "Int8" + elif q_dtype_a == dtypes.fp8: + quantDtype = "Fp8" + else: + quantDtype = "" + if doweight_stage1: + extraInfo_1stage = "_tkw1" + if q_type == 
QuantType.No: + quantDtype_1stage = "noquant" + elif q_type == QuantType.per_1x128: + quantDtype_1stage = "blockscale" + quantDtype + else: + quantDtype_1stage = "pertoken" + quantDtype + return quantDtype_1stage, extraInfo_1stage def gen_1stage_asm_task(self, key): task_1stage = [] diff --git a/hsa/gfx950/fmoe_2stages/tune.py b/hsa/gfx950/fmoe_2stages/tune.py deleted file mode 100644 index aa5b417007..0000000000 --- a/hsa/gfx950/fmoe_2stages/tune.py +++ /dev/null @@ -1,191 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. - -import torch -import pandas as pd -import time -import os -import sys -from aiter import QuantType -from aiter.jit.core import ( - AITER_CSRC_DIR, - AITER_META_DIR, - AITER_CONFIG_FMOE, -) -from aiter.utility.mp_tuner import mp_tuner - -from aiter import dtypes -from aiter import ActivationType as ActivationType - -sys.path.insert(0, f"{AITER_META_DIR}/hsa/gfx942") -from fmoe_2stages.tune import FmoeTuner - - -sys.path.insert(0, f"{AITER_CSRC_DIR}/ck_gemm_moe_2stages_codegen/") - - -torch.set_default_device("cuda") -torch.int4 = getattr(torch, "int4", torch.uint32) - - -def get_kernels_dict(file, key="tile_m"): - if not os.path.exists(file): - print(f"ASM kernel list file not exist: {file}") - return {} - df = pd.read_csv(file) - kernel_dict = df.groupby(key)["knl_name"].apply(list).to_dict() - return kernel_dict - - -class FmoeTuner950(FmoeTuner): - ARG_DEFAULTS = { - "verbose": False, - "tune_file": f"{AITER_CONFIG_FMOE}", - "untune_file": "aiter/configs/untuned_fmoe.csv", - "errRatio": 0.5, - "batch": 100, - "profile_file": "aiter/configs/profile_fmoe.csv", # for all results - } - - def get_1stage_file_info(self, q_type, q_dtype_a, doweight_stage1): - extraInfo_1stage = "" - if q_dtype_a == dtypes.i8: - quantDtype = "Int8" - elif q_dtype_a == dtypes.fp8: - quantDtype = "Fp8" - else: - quantDtype = "" - if doweight_stage1: - extraInfo_1stage = "_tkw1" - if q_type == 
QuantType.No: - quantDtype_1stage = "noquant" - elif q_type == QuantType.per_1x128: - quantDtype_1stage = "blockscale" + quantDtype - elif q_type == QuantType.per_1x32: - quantDtype_1stage = "pertoken" + "MXfp4" - else: - quantDtype_1stage = "pertoken" + quantDtype - return quantDtype_1stage, extraInfo_1stage - - def tune( - self, - untunedf, - tunedf, - args, - ): - mp_num = args.mp - startTS = time.perf_counter() - # blockMs = [16, 32, 48, 64, 80, 96, 112, 128, 144, 160] - blockMs = [16, 32, 64, 128] - - args = self.keys - print(untunedf[args]) - tasks = [] - tasks_ck = [] - task_1stage = [] - in_data = [] - for line in untunedf[args].values: - ( - cu_num, - token, - model_dim, - inter_dim, - expert, - topk, - act_type, - dtype, - q_dtype_a, - q_dtype_w, - q_type, - use_g1u1, - doweight_stage1, - ) = line - dtype = eval(dtype) - q_dtype_a = eval(q_dtype_a) - q_dtype_w = eval(q_dtype_w) - q_type = eval(q_type) - q_type = QuantType.per_1x128 if q_type == QuantType.per_128x128 else q_type - print("\nStart tuning", line) - if not use_g1u1: - print("no moe solution(g1u0) can tune for ", line) - continue - act_type = eval(act_type) - info = ( - cu_num, - token, - model_dim, - inter_dim, - expert, - topk, - act_type, - dtype, - q_dtype_a, - q_dtype_w, - q_type, - use_g1u1, - doweight_stage1, - ) - tasks.extend(self.gen_2stages_asm1_task(info, blockMs)) - tasks_ck.extend(self.gen_2stages_task(info, blockMs)) - task_1stage.extend(self.gen_1stage_asm_task(info)) - if tasks is None and tasks_ck is None and task_1stage is None: - print("no moe solution can tune for ", line) - continue - print( - f"stage1 asm tasks is {len(tasks)}, tasks_ck is {len(tasks_ck)}, task_1stage is {len(task_1stage)}" - ) - in_data.append((len(tasks) + len(tasks_ck) + len(task_1stage), ())) - rets = [] - if len(tasks) + len(tasks_ck) + len(task_1stage) > 0: - ### shape_grouped should be False as multiple stages - rets = mp_tuner( - tasks + tasks_ck + task_1stage, - in_data, - mp_num, - True, - 
False, - timeout=args.timeout, - verbose=args.verbose, - ) - if not rets: - print("no shape to tune or no solution found") - return [] - else: - return rets - - -if __name__ == "__main__": - - key = [ - "cu_num", - "token", - "model_dim", - "inter_dim", - "expert", - "topk", - "act_type", - "dtype", - "q_dtype_a", - "q_dtype_w", - "q_type", - "use_g1u1", - "doweight_stage1", - ] - resultList = [ - "block_m", - "ksplit", - "us1", - "kernelName1", - "err1", - "us2", - "kernelName2", - "err2", - "us", - "run_1stage", - "tflops", - "bw", - ] - tuner = FmoeTuner950("fmoeTuner950", key, resultList, "fmoe tuner on gfx950") - args = tuner.parse_args() - - tuner.run(args, False)