4 changes: 2 additions & 2 deletions aiter/configs/tuned_fmoe.csv
@@ -1,9 +1,8 @@
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,total_us,run_1stage,tflops,bw
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw
80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,64,0,373.4158,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf3E,0.0%,268.4886,moe_ck2stages_gemm2_256x64x128x256_1x4_MulABScaleExpertWeight_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.3%,641.9044,0,240.88,955.62
80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0,64,0,386.1143,_ZN5aiter49fmoe_stage1_bf16_pertokenInt8_g1u1_64x128_2tg_pf3E,0.0%,250.0186,moe_ck2stages_gemm2_256x64x128x256_1x4_MulABScaleExpertWeight_v3_Nswizzle0_Quant1_MulRoutedWeight1_I8_I8_B16,2.1%,636.1329000000001,0,243.06,964.29
80,4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,17.6606,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,15.126,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.3%,32.7866,0,5.18,2591.37
80,4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,17.8008,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,14.5115,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,32.3123,0,5.26,2629.41
80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,203.0534,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x128_2tg_pf3E,5.0%,128.7294,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,5.2%,331.7828,0,50.97,1823.52
80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,774.6328,moe_ck2stages_gemm1_256x64x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,459.0113,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCastExpertWeight_v3_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.3%,1233.6441,0,125.34,989.38
256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11
256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,195.38,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,107.5659,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,302.9459,0,9.3,9306.91
@@ -774,3 +773,4 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,
80,512,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,94.7864,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,76.6968,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,171.4832,0,56.35,1785.51
80,1024,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98
256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,129.9261,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,470.0698,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,599.9959,0,128.85,1027.61
80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,228.7482,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.5%,0.0,Null,0.0%,228.7482,1,73.93,2644.88
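
The first hunk above renames the per-shape total-latency column from total_us to us; the remaining rows are newly tuned entries. A minimal sketch, not part of this PR, of loading the tuned table and keeping only the fastest kernel pair per shape under the new header — the path and column names come from the diff, everything else is assumed:

import pandas as pd

# Repo-relative path from the diff header above (assumption: run from repo root).
df = pd.read_csv("aiter/configs/tuned_fmoe.csv")
shape_keys = ["cu_num", "token", "model_dim", "inter_dim", "expert", "topk"]
# Keep the fastest entry per shape via the renamed "us" column
# (us = us1 + us2, per the post_process change further below).
best = df.sort_values("us").drop_duplicates(subset=shape_keys, keep="first")
print(best[["us", "kernelName1", "kernelName2"]].head())
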
82 changes: 69 additions & 13 deletions aiter/jit/core.py
@@ -102,7 +102,11 @@ def update_config_files(file_path: str, merge_name: str):
if os.path.exists(untuned_path):
untunedf = pd.read_csv(untuned_path)
keys = untunedf.columns
merge_df = merge_df.drop_duplicates(subset=keys, keep="last")
merge_df = (
merge_df.sort_values("us")
.drop_duplicates(subset=keys, keep="first")
.reset_index(drop=True)
)
else:
logger.warning(
f"Untuned config file not found: {untuned_path}. Using all columns for deduplication."
@@ -112,9 +116,10 @@ def update_config_files(file_path: str, merge_name: str):
return new_file_path


def get_config_file(env_name, tuned_file_name):
# @functools.lru_cache(maxsize=1)
def get_config_file(env_name, default_file, tuned_file_name):
config_env_file = os.getenv(env_name)
default_file = f"{AITER_ROOT_DIR}/aiter/configs/{tuned_file_name}.csv"
# default_file = f"{AITER_ROOT_DIR}/aiter/configs/{tuned_file_name}.csv"
from pathlib import Path

if not config_env_file:
@@ -130,44 +135,95 @@ def get_config_file(env_name, tuned_file_name):
else:
tuned_files = ":".join(str(p) for p in op_tuned_file_list)
tuned_files = default_file + ":" + tuned_files
print(f"merge tuned file under model_configs/ and configs/")
print(f"merge tuned file under model_configs/ and configs/ ", tuned_files)
config_file = update_config_files(tuned_files, tuned_file_name)
else:
config_file = update_config_files(config_env_file, tuned_file_name)
print(f"get {env_name} from environment ", config_file)
# print(f"get config file from environment ", config_file)
return config_file
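
get_config_file now takes the default CSV path explicitly and merges it with any files named in the environment variable. A hedged usage sketch — the paths are hypothetical, and entries are os.pathsep-separated (":" on Linux) per the split/join logic in this file and in base_tuner.py:

import os

# Hypothetical paths; they are merged and deduplicated by
# update_config_files when aiter.jit.core is imported.
os.environ["AITER_CONFIG_GEMM_A8W8"] = (
    "/opt/tuned/a8w8_site.csv:/opt/tuned/a8w8_model.csv"
)

from aiter.jit import core  # AITER_CONFIG_GEMM_A8W8_FILE now reflects the merge
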


AITER_CONFIG_GEMM_A4W4 = os.getenv(
"AITER_CONFIG_GEMM_A4W4",
f"{AITER_ROOT_DIR}/aiter/configs/a4w4_blockscale_tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A8W8 = os.getenv(
"AITER_CONFIG_GEMM_A8W8",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE = os.getenv(
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE = os.getenv(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_blockscale_tuned_gemm.csv",
)
AITER_CONFIG_FMOE = os.getenv(
"AITER_CONFIG_FMOE",
f"{AITER_ROOT_DIR}/aiter/configs/tuned_fmoe.csv",
)

AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE = os.getenv(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv",
)

AITER_CONFIG_A8W8_BATCHED_GEMM = os.getenv(
"AITER_CONFIG_A8W8_BATCHED_GEMM",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_tuned_batched_gemm.csv",
)

AITER_CONFIG_BF16_BATCHED_GEMM = os.getenv(
"AITER_CONFIG_BF16_BATCHED_GEMM",
f"{AITER_ROOT_DIR}/aiter/configs/bf16_tuned_batched_gemm.csv",
)

AITER_CONFIG_GEMM_BF16 = os.getenv(
"AITER_CONFIG_GEMM_BF16",
f"{AITER_ROOT_DIR}/aiter/configs/tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A4W4_FILE = get_config_file(
"AITER_CONFIG_GEMM_A4W4", "a4w4_blockscale_tuned_gemm"
"AITER_CONFIG_GEMM_A4W4", AITER_CONFIG_GEMM_A4W4, "a4w4_blockscale_tuned_gemm"
)

AITER_CONFIG_GEMM_A8W8_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8", "a8w8_tuned_gemm"
"AITER_CONFIG_GEMM_A8W8", AITER_CONFIG_GEMM_A8W8, "a8w8_tuned_gemm"
)
AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE", "a8w8_bpreshuffle_tuned_gemm"
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE",
AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE,
"a8w8_bpreshuffle_tuned_gemm",
)
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE", "a8w8_blockscale_tuned_gemm"
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE",
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE,
"a8w8_blockscale_tuned_gemm",
)
AITER_CONFIG_FMOE_FILE = get_config_file(
"AITER_CONFIG_FMOE", AITER_CONFIG_FMOE, "tuned_fmoe"
)
AITER_CONFIG_FMOE_FILE = get_config_file("AITER_CONFIG_FMOE", "tuned_fmoe")

AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE",
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE,
"a8w8_blockscale_bpreshuffle_tuned_gemm",
)

AITER_CONFIG_A8W8_BATCHED_GEMM_FILE = get_config_file(
"AITER_CONFIG_A8W8_BATCHED_GEMM", "a8w8_tuned_batched_gemm"
"AITER_CONFIG_A8W8_BATCHED_GEMM",
AITER_CONFIG_A8W8_BATCHED_GEMM,
"a8w8_tuned_batched_gemm",
)

AITER_CONFIG_BF16_BATCHED_GEMM_FILE = get_config_file(
"AITER_CONFIG_BATCHED_GEMM_BF16", "bf16_tuned_batched_gemm"
"AITER_CONFIG_BF16_BATCHED_GEMM",
AITER_CONFIG_BF16_BATCHED_GEMM,
"bf16_tuned_batched_gemm",
)

AITER_CONFIG_GEMM_BF16_FILE = get_config_file(
"AITER_CONFIG_GEMM_BF16", "bf16_tuned_gemm"
"AITER_CONFIG_GEMM_BF16", AITER_CONFIG_GEMM_BF16, "bf16_tuned_gemm"
)

# config_env end here
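
The block above introduces two names per config: AITER_CONFIG_X is the raw default path, overridable through the environment, while AITER_CONFIG_X_FILE is the resolved (possibly merged and deduplicated) CSV that is actually read. A small sketch of inspecting both, assuming aiter is importable — this distinction is what the GemmTuner change below relies on:

from aiter.jit.core import AITER_CONFIG_GEMM_BF16, AITER_CONFIG_GEMM_BF16_FILE

print(AITER_CONFIG_GEMM_BF16)       # raw default path, overridable via env
print(AITER_CONFIG_GEMM_BF16_FILE)  # resolved (possibly merged) CSV
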
36 changes: 24 additions & 12 deletions aiter/utility/base_tuner.py
Expand Up @@ -12,6 +12,7 @@
from operator import itemgetter
import time
from aiter import dtypes
from aiter import core

INVALID_TIME = -1

@@ -175,15 +176,20 @@ def get_untuned_gemm_list(self, untuned_gemm_file):
filtered_df = untunedf.drop_duplicates().reset_index(drop=True)
return filtered_df

def get_out_file(self, tuned_file):
"""if there are multiple tuned file, then write tuning result to the first file"""
path_list = tuned_file.split(os.pathsep) if tuned_file else []
assert path_list, f"output tuned file is empty"
return path_list[0]

def get_tuned_gemm_list(self, tuned_gemm_file, columns=[]):
path_list = tuned_gemm_file.split(os.pathsep) if tuned_gemm_file else []
assert len(path_list) <= 1, f"tuning to multiple files is not supported"
if os.path.exists(tuned_gemm_file):
column_order = pd.read_csv(tuned_gemm_file, nrows=0).columns.tolist()
tunedf = pd.read_csv(tuned_gemm_file)
all_tuned_file = core.update_config_files(tuned_gemm_file, self.name)
if os.path.exists(all_tuned_file):
column_order = pd.read_csv(all_tuned_file, nrows=0).columns.tolist()
tunedf = pd.read_csv(all_tuned_file)
tunedf = tunedf[column_order]
else:
print(f"Not exist tuned file: {tuned_gemm_file}")
print(f"Not exist tuned file: {all_tuned_file}")
columns = self.columns if not columns else columns
tunedf = pd.DataFrame(columns=columns)
return tunedf
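
Together these two helpers split a multi-path tune_file into a read set and a write target: get_tuned_gemm_list merges every listed CSV through core.update_config_files before reading, while get_out_file picks the first entry as the file that tuning results are written to. A hedged sketch — tuner stands in for any GemmCommonTuner subclass instance, and the paths are hypothetical:

import os

tune_file = "results/tuned_gemm.csv" + os.pathsep + "site/tuned_gemm.csv"
out_file = tuner.get_out_file(tune_file)       # -> "results/tuned_gemm.csv"
tunedf = tuner.get_tuned_gemm_list(tune_file)  # merged view of both CSVs
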
@@ -192,7 +198,7 @@ def get_retune_gemm_list(self, args):
"""get retune gemm list from tune_file and untune_file"""
if args.untune_file is None:
raise ValueError("untune_file must be specified for retuning")
if args.tune_file == args.untune_file:
if self.get_out_file(args.tune_file) == args.untune_file:
# retune all shapes in tune_file
self.untunedf = self.get_untuned_gemm_list(args.untune_file)
self.tunedf = self.untunedf[self.untunedf["cu_num"] != self.get_cu_num()]
@@ -351,19 +357,23 @@ def run(self, args, fast_mode=False):
"""tuner run function"""
self.pre_process(args)
print(self.untunedf)
output_file = self.get_out_file(args.tune_file)
if args.verbose:
logger.info(f"args: {args}")
if len(self.untunedf) == 0:
# self.update_tflops_bw(args.tune_file)
self.sortResults(args.tune_file, args.sort, self.keys)
logger.info(f"no shapes to be tuned, skip tuning")
self.sortResults(output_file, args.sort, self.keys)
logger.info(
f"no shapes to be tuned, skip tuning, tuned file is {args.tune_file}"
)
return self.tunedf if self.tunedf is not None else pd.DataFrame()
batch_size = min(args.batch, len(self.untunedf))
total_batches = (len(self.untunedf) + batch_size - 1) // batch_size
if args.verbose:
logger.info(
f"total shapes to be tuned: {len(self.untunedf) }, total_batches: {total_batches}, batch_size: {batch_size}"
)
logger.info(f"results will be written to {output_file}")
processed_batches = 0
results = []
topk = -1 if fast_mode else 1
@@ -376,13 +386,15 @@
all_results = self.tune(batch, self.tunedf, args)
if all_results:
results = self.post_process(all_results, args, topk)
self.result_to_csv(results, args.tune_file, not args.all)
self.result_to_csv(results, output_file, not args.all)
logger.info(
f"processed {processed_batches} batches of {total_batches}, Processing Status ====> {round(processed_batches / total_batches,2)*100:.1f}% tuned in {self.name}"
)
else:
logger.info("tune result is none or all shape is tuned!")
self.sortResults(args.tune_file, args.sort, self.keys)
logger.info(
f"tune result is none or all shape is tuned in {args.tune_file}!"
)
self.sortResults(output_file, args.sort, self.keys)
except KeyboardInterrupt:
tuning_status = "Interrupted"
logger.error(
3 changes: 3 additions & 0 deletions csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py
@@ -203,6 +203,9 @@ def get_ck_gemm_a8w8_bpreshuffle_tune_task(
):
(cu_num, M, N, K, q_dtype_w) = info_keys
if eval(q_dtype_w) != dtypes.fp8:
print(
f"Warning: q_dtype_w only support {dtypes.fp8}, actual q_dtype_w is {q_dtype_w}!"
)
return []
kernels_num = len(kernels_list)
gemm_a8w8_idx = [0, 1, 2, 3, 4] # input index in generate_data
8 changes: 4 additions & 4 deletions gradlib/gradlib/GemmTuner.py
@@ -29,7 +29,7 @@
from functools import lru_cache
from aiter.jit.core import get_asm_dir
from aiter.jit.utils.chip_info import get_cu_num
from aiter.jit.core import AITER_CONFIG_GEMM_BF16_FILE, get_asm_dir
from aiter.jit.core import AITER_CONFIG_GEMM_BF16, get_asm_dir
from aiter.utility.base_tuner import GemmCommonTuner

aiter.rocb_create_extension()
@@ -554,7 +554,7 @@ def cleanup(self):
class GemmTuner(GemmCommonTuner):
ARG_DEFAULTS = {
**GemmCommonTuner.ARG_DEFAULTS,
"tune_file": f"{AITER_CONFIG_GEMM_BF16_FILE}",
"tune_file": f"{AITER_CONFIG_GEMM_BF16}",
"untune_file": "aiter/configs/untuned_gemm.csv",
"batch": 1,
}
@@ -563,7 +563,7 @@ def _setup_specific_arguments(self):
self.parser.add_argument(
"--tuned_file",
type=str,
default=os.getenv("GTUNE_TUNED", AITER_CONFIG_GEMM_BF16_FILE),
default=os.getenv("GTUNE_TUNED", AITER_CONFIG_GEMM_BF16),
dest="tune_file",
help="output file for tuned gemm solutions",
)
@@ -686,7 +686,7 @@ def pre_process(self, args):
outdtype=str(ds["outdtype"]),
scaleAB=ds["scaleAB"],
)
self.tunedf = self.get_tuned_gemm_list(args.tune_file)
self.tunedf = self.get_tuned_gemm_list(self.get_out_file(args.tune_file))
self.untunedf["cu_num"] = self.get_cu_num()
untunedf_cols = self.untunedf.columns
if len(self.tunedf) != 0:
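
Swapping AITER_CONFIG_GEMM_BF16_FILE for AITER_CONFIG_GEMM_BF16 here plausibly keeps tuning output out of the merged temporary file: the raw name is the stable default path (or the user's env override), so repeated runs accumulate in one place while reads still go through the merged view via get_out_file. A sketch of the resulting default, mirroring the argument definition above:

import os
from aiter.jit.core import AITER_CONFIG_GEMM_BF16

# Hedged: mirrors the --tuned_file default in the diff above.
tune_file = os.getenv("GTUNE_TUNED", AITER_CONFIG_GEMM_BF16)
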
14 changes: 8 additions & 6 deletions hsa/gfx942/fmoe_2stages/tune.py
@@ -1849,15 +1849,15 @@ def post_process(self, results, args, topk=-1, fast_mode=False):
failedf = pd.DataFrame(ret, columns=self.columns)
self.failed = pd.concat([self.failed, failedf], axis=0)
continue
profileDF["total_us"] = round(profileDF["us1"] + profileDF["us2"], 4)
profileDF["us"] = round(profileDF["us1"] + profileDF["us2"], 4)
results = profileDF.apply(
lambda row: self.calculate(
(
tuple(row[col] for col in self.keys),
"",
row["kernelName1"],
row["block_m"],
row["total_us"],
row["us"],
row["err1"],
)
),
@@ -1869,9 +1869,9 @@
profileDF.drop(["tflops1", "tflops2", "bw1", "bw2"], axis=1, inplace=True)
profileDF["err1"] = profileDF["err1"].apply(lambda x: f"{x:.1%}")
profileDF["err2"] = profileDF["err2"].apply(lambda x: f"{x:.1%}")
best_one = profileDF.loc[profileDF["total_us"].idxmin()].copy()
best_one = profileDF.loc[profileDF["us"].idxmin()].copy()
print(
f"Tuning result for {key} is {best_one['block_m'] ,best_one['kernelName1'], best_one['kernelName2'], best_one['err1'], best_one['err2'], best_one['run_1stage']} {best_one['total_us']} us, {best_one['tflops']} TFLOPS, {best_one['bw']} GB/s"
f"Tuning result for {key} is {best_one['block_m'] ,best_one['kernelName1'], best_one['kernelName2'], best_one['err1'], best_one['err2'], best_one['run_1stage']} {best_one['us']} us, {best_one['tflops']} TFLOPS, {best_one['bw']} GB/s"
)
best_one["act_type"] = str(best_one["act_type"])
best_one["q_type"] = str(best_one["q_type"])
@@ -1900,7 +1900,9 @@ def pre_process(self, args):
self.untunedf = self.get_untuned_gemm_list(args.untune_file)

if not args.all or args.last:
self.tunedf = self.get_tuned_gemm_list(args.tune_file)
self.tunedf = self.get_tuned_gemm_list(
self.get_out_file(args.tune_file)
)
else:
self.tunedf = None
self.untunedf["cu_num"] = self.get_cu_num()
@@ -1941,7 +1943,7 @@
"us2",
"kernelName2",
"err2",
"total_us",
"us",
"run_1stage",
"tflops",
"bw",
2 changes: 1 addition & 1 deletion hsa/gfx950/fmoe_2stages/tune.py
@@ -192,7 +192,7 @@ def tune(
"us2",
"kernelName2",
"err2",
"total_us",
"us",
"run_1stage",
"tflops",
"bw",