4 changes: 2 additions & 2 deletions aiter/configs/tuned_fmoe.csv
@@ -1,9 +1,8 @@
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,total_us,run_1stage,tflops,bw
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw
80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,64,0,373.4158,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf3E,0.0%,268.4886,moe_ck2stages_gemm2_256x64x128x256_1x4_MulABScaleExpertWeight_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.3%,641.9044,0,240.88,955.62
80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.int8,torch.int8,QuantType.per_Tensor,1,0,64,0,386.1143,_ZN5aiter49fmoe_stage1_bf16_pertokenInt8_g1u1_64x128_2tg_pf3E,0.0%,250.0186,moe_ck2stages_gemm2_256x64x128x256_1x4_MulABScaleExpertWeight_v3_Nswizzle0_Quant1_MulRoutedWeight1_I8_I8_B16,2.1%,636.1329000000001,0,243.06,964.29
80,4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,17.6606,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,15.126,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.3%,32.7866,0,5.18,2591.37
80,4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,17.8008,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,14.5115,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,32.3123,0,5.26,2629.41
80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,203.0534,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_64x128_2tg_pf3E,5.0%,128.7294,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,5.2%,331.7828,0,50.97,1823.52
80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,774.6328,moe_ck2stages_gemm1_256x64x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,459.0113,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCastExpertWeight_v3_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.3%,1233.6441,0,125.34,989.38
256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11
256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,195.38,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,107.5659,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,302.9459,0,9.3,9306.91
@@ -774,3 +773,4 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,
80,512,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,94.7864,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,76.6968,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,171.4832,0,56.35,1785.51
80,1024,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98
256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,129.9261,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,470.0698,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,599.9959,0,128.85,1027.61
80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,228.7482,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.5%,0.0,Null,0.0%,228.7482,1,73.93,2644.88
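
The first hunk above renames the per-shape total-latency column from total_us to us; the remaining rows are newly tuned entries. A minimal sketch, not part of this PR, of loading the tuned table and keeping only the fastest kernel pair per shape under the new header — the path and column names come from the diff, everything else is assumed:

import pandas as pd

# Repo-relative path from the diff header above (assumption: run from repo root).
df = pd.read_csv("aiter/configs/tuned_fmoe.csv")
shape_keys = ["cu_num", "token", "model_dim", "inter_dim", "expert", "topk"]
# Keep the fastest entry per shape via the renamed "us" column
# (us = us1 + us2, per the post_process change further below).
best = df.sort_values("us").drop_duplicates(subset=shape_keys, keep="first")
print(best[["us", "kernelName1", "kernelName2"]].head())
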
82 changes: 69 additions & 13 deletions aiter/jit/core.py
@@ -102,7 +102,11 @@ def update_config_files(file_path: str, merge_name: str):
if os.path.exists(untuned_path):
untunedf = pd.read_csv(untuned_path)
keys = untunedf.columns
merge_df = merge_df.drop_duplicates(subset=keys, keep="last")
merge_df = (
merge_df.sort_values("us")
.drop_duplicates(subset=keys, keep="first")
.reset_index(drop=True)
)
else:
logger.warning(
f"Untuned config file not found: {untuned_path}. Using all columns for deduplication."
@@ -112,9 +116,10 @@ def update_config_files(file_path: str, merge_name: str):
return new_file_path


def get_config_file(env_name, tuned_file_name):
# @functools.lru_cache(maxsize=1)
def get_config_file(env_name, default_file, tuned_file_name):
config_env_file = os.getenv(env_name)
default_file = f"{AITER_ROOT_DIR}/aiter/configs/{tuned_file_name}.csv"
# default_file = f"{AITER_ROOT_DIR}/aiter/configs/{tuned_file_name}.csv"
from pathlib import Path

if not config_env_file:
@@ -130,44 +135,95 @@ def get_config_file(env_name, tuned_file_name):
else:
tuned_files = ":".join(str(p) for p in op_tuned_file_list)
tuned_files = default_file + ":" + tuned_files
print(f"merge tuned file under model_configs/ and configs/")
print(f"merge tuned file under model_configs/ and configs/ ", tuned_files)
config_file = update_config_files(tuned_files, tuned_file_name)
else:
config_file = update_config_files(config_env_file, tuned_file_name)
print(f"get {env_name} from environment ", config_file)
# print(f"get config file from environment ", config_file)
return config_file
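
get_config_file now takes the default CSV path explicitly and merges it with any files named in the environment variable. A hedged usage sketch — the paths are hypothetical, and entries are os.pathsep-separated (":" on Linux) per the split/join logic in this file and in base_tuner.py:

import os

# Hypothetical paths; they are merged and deduplicated by
# update_config_files when aiter.jit.core is imported.
os.environ["AITER_CONFIG_GEMM_A8W8"] = (
    "/opt/tuned/a8w8_site.csv:/opt/tuned/a8w8_model.csv"
)

from aiter.jit import core  # AITER_CONFIG_GEMM_A8W8_FILE now reflects the merge
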


AITER_CONFIG_GEMM_A4W4 = os.getenv(
"AITER_CONFIG_GEMM_A4W4",
f"{AITER_ROOT_DIR}/aiter/configs/a4w4_blockscale_tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A8W8 = os.getenv(
"AITER_CONFIG_GEMM_A8W8",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE = os.getenv(
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE = os.getenv(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_blockscale_tuned_gemm.csv",
)
AITER_CONFIG_FMOE = os.getenv(
"AITER_CONFIG_FMOE",
f"{AITER_ROOT_DIR}/aiter/configs/tuned_fmoe.csv",
)

AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE = os.getenv(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_blockscale_bpreshuffle_tuned_gemm.csv",
)

AITER_CONFIG_A8W8_BATCHED_GEMM = os.getenv(
"AITER_CONFIG_A8W8_BATCHED_GEMM",
f"{AITER_ROOT_DIR}/aiter/configs/a8w8_tuned_batched_gemm.csv",
)

AITER_CONFIG_BF16_BATCHED_GEMM = os.getenv(
"AITER_CONFIG_BF16_BATCHED_GEMM",
f"{AITER_ROOT_DIR}/aiter/configs/bf16_tuned_batched_gemm.csv",
)

AITER_CONFIG_GEMM_BF16 = os.getenv(
"AITER_CONFIG_GEMM_BF16",
f"{AITER_ROOT_DIR}/aiter/configs/tuned_gemm.csv",
)
AITER_CONFIG_GEMM_A4W4_FILE = get_config_file(
"AITER_CONFIG_GEMM_A4W4", "a4w4_blockscale_tuned_gemm"
"AITER_CONFIG_GEMM_A4W4", AITER_CONFIG_GEMM_A4W4, "a4w4_blockscale_tuned_gemm"
)

AITER_CONFIG_GEMM_A8W8_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8", "a8w8_tuned_gemm"
"AITER_CONFIG_GEMM_A8W8", AITER_CONFIG_GEMM_A8W8, "a8w8_tuned_gemm"
)
AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE", "a8w8_bpreshuffle_tuned_gemm"
"AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE",
AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE,
"a8w8_bpreshuffle_tuned_gemm",
)
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE", "a8w8_blockscale_tuned_gemm"
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE",
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE,
"a8w8_blockscale_tuned_gemm",
)
AITER_CONFIG_FMOE_FILE = get_config_file(
"AITER_CONFIG_FMOE", AITER_CONFIG_FMOE, "tuned_fmoe"
)
AITER_CONFIG_FMOE_FILE = get_config_file("AITER_CONFIG_FMOE", "tuned_fmoe")

AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_FILE = get_config_file(
"AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE",
AITER_CONFIG_GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE,
"a8w8_blockscale_bpreshuffle_tuned_gemm",
)

AITER_CONFIG_A8W8_BATCHED_GEMM_FILE = get_config_file(
"AITER_CONFIG_A8W8_BATCHED_GEMM", "a8w8_tuned_batched_gemm"
"AITER_CONFIG_A8W8_BATCHED_GEMM",
AITER_CONFIG_A8W8_BATCHED_GEMM,
"a8w8_tuned_batched_gemm",
)

AITER_CONFIG_BF16_BATCHED_GEMM_FILE = get_config_file(
"AITER_CONFIG_BATCHED_GEMM_BF16", "bf16_tuned_batched_gemm"
"AITER_CONFIG_BF16_BATCHED_GEMM",
AITER_CONFIG_BF16_BATCHED_GEMM,
"bf16_tuned_batched_gemm",
)

AITER_CONFIG_GEMM_BF16_FILE = get_config_file(
"AITER_CONFIG_GEMM_BF16", "bf16_tuned_gemm"
"AITER_CONFIG_GEMM_BF16", AITER_CONFIG_GEMM_BF16, "bf16_tuned_gemm"
)

# config_env end here
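
The block above introduces two names per config: AITER_CONFIG_X is the raw default path, overridable through the environment, while AITER_CONFIG_X_FILE is the resolved (possibly merged and deduplicated) CSV that is actually read. A small sketch of inspecting both, assuming aiter is importable — this distinction is what the GemmTuner change below relies on:

from aiter.jit.core import AITER_CONFIG_GEMM_BF16, AITER_CONFIG_GEMM_BF16_FILE

print(AITER_CONFIG_GEMM_BF16)       # raw default path, overridable via env
print(AITER_CONFIG_GEMM_BF16_FILE)  # resolved (possibly merged) CSV
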
36 changes: 24 additions & 12 deletions aiter/utility/base_tuner.py
Expand Up @@ -12,6 +12,7 @@
from operator import itemgetter
import time
from aiter import dtypes
from aiter import core

INVALID_TIME = -1

@@ -175,15 +176,20 @@ def get_untuned_gemm_list(self, untuned_gemm_file):
filtered_df = untunedf.drop_duplicates().reset_index(drop=True)
return filtered_df

def get_out_file(self, tuned_file):
"""if there are multiple tuned file, then write tuning result to the first file"""
path_list = tuned_file.split(os.pathsep) if tuned_file else []
assert path_list, f"output tuned file is empty"
return path_list[0]

def get_tuned_gemm_list(self, tuned_gemm_file, columns=[]):
path_list = tuned_gemm_file.split(os.pathsep) if tuned_gemm_file else []
assert len(path_list) <= 1, f"tuning to multiple files is not supported"
if os.path.exists(tuned_gemm_file):
column_order = pd.read_csv(tuned_gemm_file, nrows=0).columns.tolist()
tunedf = pd.read_csv(tuned_gemm_file)
all_tuned_file = core.update_config_files(tuned_gemm_file, self.name)
if os.path.exists(all_tuned_file):
column_order = pd.read_csv(all_tuned_file, nrows=0).columns.tolist()
tunedf = pd.read_csv(all_tuned_file)
tunedf = tunedf[column_order]
else:
print(f"Not exist tuned file: {tuned_gemm_file}")
print(f"Not exist tuned file: {all_tuned_file}")
columns = self.columns if not columns else columns
tunedf = pd.DataFrame(columns=columns)
return tunedf
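
Together these two helpers split a multi-path tune_file into a read set and a write target: get_tuned_gemm_list merges every listed CSV through core.update_config_files before reading, while get_out_file picks the first entry as the file that tuning results are written to. A hedged sketch — tuner stands in for any GemmCommonTuner subclass instance, and the paths are hypothetical:

import os

tune_file = "results/tuned_gemm.csv" + os.pathsep + "site/tuned_gemm.csv"
out_file = tuner.get_out_file(tune_file)       # -> "results/tuned_gemm.csv"
tunedf = tuner.get_tuned_gemm_list(tune_file)  # merged view of both CSVs
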
@@ -192,7 +198,7 @@ def get_retune_gemm_list(self, args):
"""get retune gemm list from tune_file and untune_file"""
if args.untune_file is None:
raise ValueError("untune_file must be specified for retuning")
if args.tune_file == args.untune_file:
if self.get_out_file(args.tune_file) == args.untune_file:
# retune all shapes in tune_file
self.untunedf = self.get_untuned_gemm_list(args.untune_file)
self.tunedf = self.untunedf[self.untunedf["cu_num"] != self.get_cu_num()]
@@ -351,19 +357,23 @@ def run(self, args, fast_mode=False):
"""tuner run function"""
self.pre_process(args)
print(self.untunedf)
output_file = self.get_out_file(args.tune_file)
if args.verbose:
logger.info(f"args: {args}")
if len(self.untunedf) == 0:
# self.update_tflops_bw(args.tune_file)
self.sortResults(args.tune_file, args.sort, self.keys)
logger.info(f"no shapes to be tuned, skip tuning")
self.sortResults(output_file, args.sort, self.keys)
logger.info(
f"no shapes to be tuned, skip tuning, tuned file is {args.tune_file}"
)
return self.tunedf if self.tunedf is not None else pd.DataFrame()
batch_size = min(args.batch, len(self.untunedf))
total_batches = (len(self.untunedf) + batch_size - 1) // batch_size
if args.verbose:
logger.info(
f"total shapes to be tuned: {len(self.untunedf) }, total_batches: {total_batches}, batch_size: {batch_size}"
)
logger.info(f"results will be written to {output_file}")
processed_batches = 0
results = []
topk = -1 if fast_mode else 1
@@ -376,13 +386,15 @@
all_results = self.tune(batch, self.tunedf, args)
if all_results:
results = self.post_process(all_results, args, topk)
self.result_to_csv(results, args.tune_file, not args.all)
self.result_to_csv(results, output_file, not args.all)
logger.info(
f"processed {processed_batches} batches of {total_batches}, Processing Status ====> {round(processed_batches / total_batches,2)*100:.1f}% tuned in {self.name}"
)
else:
logger.info("tune result is none or all shape is tuned!")
self.sortResults(args.tune_file, args.sort, self.keys)
logger.info(
f"tune result is none or all shape is tuned in {args.tune_file}!"
)
self.sortResults(output_file, args.sort, self.keys)
except KeyboardInterrupt:
tuning_status = "Interrupted"
logger.error(
3 changes: 3 additions & 0 deletions csrc/ck_gemm_a8w8_bpreshuffle/gemm_a8w8_bpreshuffle_tune.py
@@ -203,6 +203,9 @@ def get_ck_gemm_a8w8_bpreshuffle_tune_task(
):
(cu_num, M, N, K, q_dtype_w) = info_keys
if eval(q_dtype_w) != dtypes.fp8:
print(
f"Warning: q_dtype_w only support {dtypes.fp8}, actual q_dtype_w is {q_dtype_w}!"
)
return []
kernels_num = len(kernels_list)
gemm_a8w8_idx = [0, 1, 2, 3, 4] # input index in generate_data
8 changes: 4 additions & 4 deletions gradlib/gradlib/GemmTuner.py
@@ -29,7 +29,7 @@
from functools import lru_cache
from aiter.jit.core import get_asm_dir
from aiter.jit.utils.chip_info import get_cu_num
from aiter.jit.core import AITER_CONFIG_GEMM_BF16_FILE, get_asm_dir
from aiter.jit.core import AITER_CONFIG_GEMM_BF16, get_asm_dir
from aiter.utility.base_tuner import GemmCommonTuner

aiter.rocb_create_extension()
@@ -554,7 +554,7 @@ def cleanup(self):
class GemmTuner(GemmCommonTuner):
ARG_DEFAULTS = {
**GemmCommonTuner.ARG_DEFAULTS,
"tune_file": f"{AITER_CONFIG_GEMM_BF16_FILE}",
"tune_file": f"{AITER_CONFIG_GEMM_BF16}",
"untune_file": "aiter/configs/untuned_gemm.csv",
"batch": 1,
}
@@ -563,7 +563,7 @@ def _setup_specific_arguments(self):
self.parser.add_argument(
"--tuned_file",
type=str,
default=os.getenv("GTUNE_TUNED", AITER_CONFIG_GEMM_BF16_FILE),
default=os.getenv("GTUNE_TUNED", AITER_CONFIG_GEMM_BF16),
dest="tune_file",
help="output file for tuned gemm solutions",
)
@@ -686,7 +686,7 @@ def pre_process(self, args):
outdtype=str(ds["outdtype"]),
scaleAB=ds["scaleAB"],
)
self.tunedf = self.get_tuned_gemm_list(args.tune_file)
self.tunedf = self.get_tuned_gemm_list(self.get_out_file(args.tune_file))
self.untunedf["cu_num"] = self.get_cu_num()
untunedf_cols = self.untunedf.columns
if len(self.tunedf) != 0:
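
Swapping AITER_CONFIG_GEMM_BF16_FILE for AITER_CONFIG_GEMM_BF16 here plausibly keeps tuning output out of the merged temporary file: the raw name is the stable default path (or the user's env override), so repeated runs accumulate in one place while reads still go through the merged view via get_out_file. A sketch of the resulting default, mirroring the argument definition above:

import os
from aiter.jit.core import AITER_CONFIG_GEMM_BF16

# Hedged: mirrors the --tuned_file default in the diff above.
tune_file = os.getenv("GTUNE_TUNED", AITER_CONFIG_GEMM_BF16)
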
14 changes: 8 additions & 6 deletions hsa/gfx942/fmoe_2stages/tune.py
@@ -1849,15 +1849,15 @@ def post_process(self, results, args, topk=-1, fast_mode=False):
failedf = pd.DataFrame(ret, columns=self.columns)
self.failed = pd.concat([self.failed, failedf], axis=0)
continue
profileDF["total_us"] = round(profileDF["us1"] + profileDF["us2"], 4)
profileDF["us"] = round(profileDF["us1"] + profileDF["us2"], 4)
results = profileDF.apply(
lambda row: self.calculate(
(
tuple(row[col] for col in self.keys),
"",
row["kernelName1"],
row["block_m"],
row["total_us"],
row["us"],
row["err1"],
)
),
@@ -1869,9 +1869,9 @@
profileDF.drop(["tflops1", "tflops2", "bw1", "bw2"], axis=1, inplace=True)
profileDF["err1"] = profileDF["err1"].apply(lambda x: f"{x:.1%}")
profileDF["err2"] = profileDF["err2"].apply(lambda x: f"{x:.1%}")
best_one = profileDF.loc[profileDF["total_us"].idxmin()].copy()
best_one = profileDF.loc[profileDF["us"].idxmin()].copy()
print(
f"Tuning result for {key} is {best_one['block_m'] ,best_one['kernelName1'], best_one['kernelName2'], best_one['err1'], best_one['err2'], best_one['run_1stage']} {best_one['total_us']} us, {best_one['tflops']} TFLOPS, {best_one['bw']} GB/s"
f"Tuning result for {key} is {best_one['block_m'] ,best_one['kernelName1'], best_one['kernelName2'], best_one['err1'], best_one['err2'], best_one['run_1stage']} {best_one['us']} us, {best_one['tflops']} TFLOPS, {best_one['bw']} GB/s"
)
best_one["act_type"] = str(best_one["act_type"])
best_one["q_type"] = str(best_one["q_type"])
@@ -1900,7 +1900,9 @@ def pre_process(self, args):
self.untunedf = self.get_untuned_gemm_list(args.untune_file)

if not args.all or args.last:
self.tunedf = self.get_tuned_gemm_list(args.tune_file)
self.tunedf = self.get_tuned_gemm_list(
self.get_out_file(args.tune_file)
)
else:
self.tunedf = None
self.untunedf["cu_num"] = self.get_cu_num()
@@ -1941,7 +1943,7 @@
"us2",
"kernelName2",
"err2",
"total_us",
"us",
"run_1stage",
"tflops",
"bw",
2 changes: 1 addition & 1 deletion hsa/gfx950/fmoe_2stages/tune.py
@@ -192,7 +192,7 @@ def tune(
"us2",
"kernelName2",
"err2",
"total_us",
"us",
"run_1stage",
"tflops",
"bw",