From b6d6e0db5b46c605bbf9a9443c78c03e06bb9bdc Mon Sep 17 00:00:00 2001 From: "Ying.Zhou2" Date: Mon, 23 Mar 2026 16:53:24 +0800 Subject: [PATCH 1/3] rm gemm_commona= and quant type bind --- aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv | 21 -------------- aiter/jit/core.py | 28 +++++++++++++----- aiter/jit/optCompilerConfig.json | 13 --------- aiter/ops/enum.py | 29 ++++++++++++------- aiter/ops/gemm_op_common.py | 8 ++--- csrc/include/aiter_enum.h | 1 + csrc/include/gemm_common.h | 11 +++++-- csrc/include/rocm_ops.hpp | 21 -------------- csrc/py_itfs_cu/gemm_common.cu | 20 ++++++------- csrc/pybind/aiter_enum_pybind.cu | 10 ------- csrc/pybind/gemm_common_pybind.cu | 9 ------ setup.py | 3 +- 12 files changed, 64 insertions(+), 110 deletions(-) mode change 100755 => 100644 aiter/jit/optCompilerConfig.json delete mode 100644 csrc/pybind/aiter_enum_pybind.cu delete mode 100644 csrc/pybind/gemm_common_pybind.cu diff --git a/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv index 7263c6b7be..4bb70de123 100644 --- a/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv +++ b/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv @@ -449,37 +449,16 @@ cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio 80,64,1536,5120,torch.int8,asm,160,6,12.9215,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,77.9,649.2,0.1336 80,80,1536,5120,torch.int8,asm,160,6,13.5695,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,92.73,627.86,0.135 80,128,1536,5120,torch.int8,asm,160,6,14.3572,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,140.23,620.8,0.1335 -80,150,1536,5120,torch.int8,asm,161,6,18.1719,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,129.83,500.39,0.1344 -80,192,1536,5120,torch.int8,asm,161,6,18.2027,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,165.9,518.45,0.1342 80,220,1536,5120,torch.int8,asm,160,3,21.1629,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,163.51,456.77,0.0857 
80,256,1536,5120,torch.int8,asm,160,3,21.3733,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,188.39,466.07,0.0853 -80,384,1536,5120,torch.int8,asm,161,3,26.9758,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,223.9,408.15,0.0875 -80,448,1536,5120,torch.int8,asm,161,2,33.1084,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,212.83,348.38,0.0523 -80,512,1536,5120,torch.int8,asm,161,2,33.5781,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,239.83,359.12,0.0534 80,128,8192,1024,torch.int8,asm,160,1,14.9178,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,143.95,711.69,0.0 -80,192,8192,1024,torch.int8,asm,161,1,19.5799,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,164.52,599.13,0.0007 80,256,8192,1024,torch.int8,asm,160,1,27.0157,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,158.98,475.47,0.0 -80,320,8192,1024,torch.int8,asm,161,1,34.1636,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,157.15,408.6,0.0 -80,512,8192,1024,torch.int8,asm,161,1,49.4192,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,173.82,350.1,0.0003 -80,1024,8192,1024,torch.int8,asm,161,1,82.0338,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,209.42,319.56,0.0 -80,2048,8192,1024,torch.int8,asm,161,1,147.0079,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,233.73,299.58,0.0003 -80,4096,8192,1024,torch.int8,asm,161,1,288.7332,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,238.0,276.0,0.0003 -80,8192,8192,1024,torch.int8,asm,161,1,563.934,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,243.71,267.75,0.0006 -80,16384,8192,1024,torch.int8,asm,161,1,1104.9189,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,248.78,265.72,0.0004 -80,128,1280,8192,torch.int8,asm,161,8,18.4707,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,145.33,642.21,0.1588 -80,192,1280,5120,torch.int8,asm,161,8,16.3619,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,153.81,490.66,0.1651 
-80,192,1280,8192,torch.int8,asm,161,8,19.5141,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,206.34,643.13,0.1637 80,192,1280,1024,torch.int8,asm,160,6,9.0346,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,55.71,221.24,0.0914 80,192,1536,1024,torch.int8,asm,160,3,10.3979,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,58.09,226.9,0.0725 80,256,1280,8192,torch.int8,asm,160,4,23.0901,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,232.51,573.33,0.1074 -80,320,1280,8192,torch.int8,asm,161,4,29.0859,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,230.73,478.8,0.107 80,512,1280,8192,torch.int8,asm,160,2,38.6745,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,277.64,413.47,0.0542 80,1024,1280,8192,torch.int8,asm,160,1,69.7841,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,307.73,308.03,0.0 80,2048,1280,8192,torch.int8,asm,160,1,132.2413,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,324.78,245.81,0.0 -80,4096,1280,8192,torch.int8,asm,161,1,251.3328,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,341.78,216.95,0.0021 -80,8192,1280,8192,torch.int8,asm,161,1,496.6916,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,345.89,198.45,0.0016 -80,16384,1280,8192,torch.int8,asm,161,1,906.8273,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,378.9,205.82,0.0021 -80,192,8192,5120,torch.int8,asm,161,1,60.0426,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_192x128E,268.25,767.32,0.0026 304,64,1536,5120,torch.int8,asm,0,1,16.7759,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,60.0,500.04,0.0 304,128,1536,5120,torch.int8,asm,0,1,17.9602,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,112.1,496.26,0.0 304,256,1536,5120,torch.int8,asm,0,1,17.6791,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,227.76,563.46,0.0 diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 7e366d5b5e..d38fea5d28 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -1067,18 +1067,29 @@ def _ensure_loaded(): ) 
lib = ctypes.CDLL(so_path) c_func = getattr(lib, fc_name) - c_func.restype = None hints = typing.get_type_hints(func) + + ret_hint = hints.get("return") + if ret_hint is int: + c_func.restype = ctypes.c_int + elif ret_hint is float: + c_func.restype = ctypes.c_float + else: + c_func.restype = None + argtypes = [] + has_tensor = False for pname in inspect.signature(func).parameters: hint = hints.get(pname) origin = typing.get_origin(hint) type_args = typing.get_args(hint) if hint is torch.Tensor: argtypes.append(ctypes.POINTER(AiterTensor)) + has_tensor = True elif _is_union(origin) and torch.Tensor in type_args: argtypes.append(ctypes.POINTER(AiterTensor)) + has_tensor = True elif _is_union(origin) and int in type_args: argtypes.append(ctypes.c_int) elif _is_union(origin) and str in type_args: @@ -1091,11 +1102,13 @@ def _ensure_loaded(): argtypes.append(ctypes.c_float) else: argtypes.append(ctypes.c_void_p) - argtypes.append(ctypes.c_void_p) # hipStream_t + if has_tensor: + argtypes.append(ctypes.c_void_p) # hipStream_t c_func.argtypes = argtypes _cache["lib"] = lib _cache["c_func"] = c_func + _cache["has_tensor"] = has_tensor def _check_args_before_convert(bound_args, hints): for pname, value in bound_args.items(): @@ -1197,10 +1210,11 @@ def caller(*args, **kwargs): else: c_args.append(value) - c_args.append( - ctypes.c_void_p(torch.cuda.current_stream(tensor_device).cuda_stream) - ) - c_func(*c_args) + if _cache.get("has_tensor"): + c_args.append( + ctypes.c_void_p(torch.cuda.current_stream(tensor_device).cuda_stream) + ) + return c_func(*c_args) return caller @@ -1220,7 +1234,7 @@ def decorator(func): @functools.wraps(func) def ctypes_wrapper(*args, **kwargs): - ctypes_caller(*args, **kwargs) + return ctypes_caller(*args, **kwargs) @torch_compile_guard(device="cuda", calling_func_=func) def ctypes_custom_wrapper(*args, **kwargs): diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json old mode 100755 new mode 100644 index 
8528c1599b..a54f31d8be --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -1,16 +1,4 @@ { - "module_aiter_enum": { - "srcs": [ - "f'{AITER_CSRC_DIR}/pybind/aiter_enum_pybind.cu'" - ], - "flags_extra_cc": [], - "flags_extra_hip": [], - "extra_ldflags": "None", - "extra_include": [], - "verbose": "False", - "torch_exclude": "True", - "blob_gen_cmd": "''" - }, "module_activation": { "srcs": [ "f'{AITER_CSRC_DIR}/pybind/activation_pybind.cu'", @@ -198,7 +186,6 @@ }, "module_gemm_common": { "srcs": [ - "f'{AITER_CSRC_DIR}/pybind/gemm_common_pybind.cu'", "f'{AITER_CSRC_DIR}/py_itfs_cu/gemm_common.cu'" ], "flags_extra_cc": [], diff --git a/aiter/ops/enum.py b/aiter/ops/enum.py index edc1ddd671..2fdcfc6f47 100644 --- a/aiter/ops/enum.py +++ b/aiter/ops/enum.py @@ -1,16 +1,25 @@ -from ..jit.core import compile_ops - -# from enum import Enum as Enum -Enum = int +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +# Mirror of csrc/include/aiter_enum.h -- update both when changing enum values +from enum import IntEnum -@compile_ops("module_aiter_enum", "ActivationType") -def _ActivationType(dummy): ... +Enum = int -@compile_ops("module_aiter_enum", "QuantType") -def _QuantType(dummy): ... +class ActivationType(IntEnum): + No = -1 + Silu = 0 + Gelu = 1 + Swiglu = 2 -ActivationType = type(_ActivationType(0)) -QuantType = type(_QuantType(0)) +class QuantType(IntEnum): + No = 0 + per_Tensor = 1 + per_Token = 2 + per_1x32 = 3 + per_1x128 = 4 + per_128x128 = 5 + per_256x128 = 6 + per_1024x128 = 7 diff --git a/aiter/ops/gemm_op_common.py b/aiter/ops/gemm_op_common.py index 114e71963b..a5e893516e 100644 --- a/aiter/ops/gemm_op_common.py +++ b/aiter/ops/gemm_op_common.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
-from ..jit.core import ( - compile_ops, -) +from ..jit.core import compile_ops -@compile_ops("module_gemm_common") +@compile_ops("module_gemm_common", fc_name="getPaddedM", ffi_type="ctypes") def get_padded_m(M: int, N: int, K: int, gl: int) -> int: ... diff --git a/csrc/include/aiter_enum.h b/csrc/include/aiter_enum.h index fc33faa179..902a85c7d9 100644 --- a/csrc/include/aiter_enum.h +++ b/csrc/include/aiter_enum.h @@ -1,6 +1,7 @@ #pragma once // SPDX-License-Identifier: MIT // Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +// Mirrored by aiter/ops/enum.py -- update both when changing enum values #include diff --git a/csrc/include/gemm_common.h b/csrc/include/gemm_common.h index da1c33d115..debb85fe53 100644 --- a/csrc/include/gemm_common.h +++ b/csrc/include/gemm_common.h @@ -1,6 +1,13 @@ #pragma once // SPDX-License-Identifier: MIT -// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. -#include +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifdef __cplusplus +extern "C" { +#endif int getPaddedM(int M, int N, int K, int gl /*granularity level*/); + +#ifdef __cplusplus +} +#endif diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 3cc63807f9..dcbd9a139f 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -1673,27 +1673,6 @@ namespace py = pybind11; m.def("rocb_mm", &RocSolIdxBlas, "mm"); \ m.def("rocb_findallsols", &RocFindAllSolIdxBlas, "rocblas_find_all_sols"); -#define AITER_ENUM_PYBIND \ - pybind11::enum_(m, "QuantType") \ - .value("No", QuantType::No) \ - .value("per_Tensor", QuantType::per_Tensor) \ - .value("per_Token", QuantType::per_Token) \ - .value("per_1x32", QuantType::per_1x32) \ - .value("per_1x128", QuantType::per_1x128) \ - .value("per_128x128", QuantType::per_128x128) \ - .value("per_256x128", QuantType::per_256x128) \ - .value("per_1024x128", QuantType::per_1024x128) \ - .export_values(); \ - pybind11::enum_(m, "ActivationType") \ - .value("No", ActivationType::No) \ - .value("Silu", ActivationType::Silu) \ - .value("Gelu", ActivationType::Gelu) \ - .value("Swiglu", ActivationType::Swiglu) \ - .export_values(); \ - pybind11::implicitly_convertible(); \ - pybind11::implicitly_convertible(); -#define GEMM_COMMON_PYBIND \ - m.def("get_padded_m", &getPaddedM, py::arg("M"), py::arg("N"), py::arg("K"), py::arg("gl")); #define TOP_K_PER_ROW_PYBIND \ m.def("top_k_per_row_prefill", \ diff --git a/csrc/py_itfs_cu/gemm_common.cu b/csrc/py_itfs_cu/gemm_common.cu index 743b3c98f9..b1d0b0059f 100644 --- a/csrc/py_itfs_cu/gemm_common.cu +++ b/csrc/py_itfs_cu/gemm_common.cu @@ -1,6 +1,6 @@ - -#include -#include "gemm_common.h" +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. 
+#include static constexpr int nextPow2(unsigned int num) { @@ -9,25 +9,26 @@ static constexpr int nextPow2(unsigned int num) return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); } +extern "C" __attribute__((visibility("default"))) int getPaddedM(int M, int N, int K, int gl) { int padded_m = M; // granularity level, gl = 0, Fine-grained search if (gl == 0) { if(M <= 256) { - padded_m = (M + 15) / 16 * 16; // Round up to the next multiple of 16 + padded_m = (M + 15) / 16 * 16; } else if(M <= 1024) { - padded_m = (M + 31) / 32 * 32; // Round up to the next multiple of 32 + padded_m = (M + 31) / 32 * 32; } else if(M <= 4096) { - padded_m = (M + 63) / 64 * 64; // Round up to the next multiple of 64 + padded_m = (M + 63) / 64 * 64; } else { - padded_m = (M + 127) / 128 * 128; // Round up to the next multiple of 128 + padded_m = (M + 127) / 128 * 128; } } else if (gl == 1) { if (M > 8192 && N > 4096) { @@ -35,7 +36,6 @@ int getPaddedM(int M, int N, int K, int gl) { } else { padded_m = nextPow2(M); } - } + } return padded_m; - -} \ No newline at end of file +} diff --git a/csrc/pybind/aiter_enum_pybind.cu b/csrc/pybind/aiter_enum_pybind.cu deleted file mode 100644 index 165b6cf6ca..0000000000 --- a/csrc/pybind/aiter_enum_pybind.cu +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#include -#include "rocm_ops.hpp" -#include "aiter_enum.h" - -PYBIND11_MODULE(module_aiter_enum, m) -{ - AITER_ENUM_PYBIND; -} diff --git a/csrc/pybind/gemm_common_pybind.cu b/csrc/pybind/gemm_common_pybind.cu deleted file mode 100644 index cc8874cf9d..0000000000 --- a/csrc/pybind/gemm_common_pybind.cu +++ /dev/null @@ -1,9 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
-#include "rocm_ops.hpp" -#include "gemm_common.h" - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - GEMM_COMMON_PYBIND; -} \ No newline at end of file diff --git a/setup.py b/setup.py index 605cc94716..f7c1c1607f 100644 --- a/setup.py +++ b/setup.py @@ -165,10 +165,9 @@ def get_exclude_ops(): ): exclude_ops.append(module) elif PREBUILD_KERNELS == 3: - # Keep only module_fmha_v3* and module_aiter_enum + # Keep only module_fmha_v3* if not ( module.startswith("module_fmha_v3") - or module == "module_aiter_enum" or module == "module_gemm_mi350_a8w8_blockscale_asm" ): exclude_ops.append(module) From 1d3f455089ebdade04188fd2efba5e22b8d938b4 Mon Sep 17 00:00:00 2001 From: "Ying.Zhou2" Date: Mon, 23 Mar 2026 17:11:46 +0800 Subject: [PATCH 2/3] retune failed shape in a8w8_bpreshuffle_tuned_gemm.csv --- aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv | 1007 +++++++++-------- 1 file changed, 514 insertions(+), 493 deletions(-) diff --git a/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv b/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv index 4bb70de123..cd001e869b 100644 --- a/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv +++ b/aiter/configs/a8w8_bpreshuffle_tuned_gemm.csv @@ -1,493 +1,514 @@ -cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio -80,192,1536,5120,torch.float8_e4m3fnuz,cktile,9,0,18.9229,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,159.59,498.72,0.0 -256,1,800,5120,torch.float8_e4m3fn,ck,10,0,11.3133,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.72,362.65,0.0 -256,1,1280,8192,torch.float8_e4m3fn,ck,10,0,12.9809,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.62,808.61,0.0 -256,1,2304,16384,torch.float8_e4m3fn,ck,10,0,28.7056,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.63,1315.76,0.0 
-256,1,2560,8192,torch.float8_e4m3fn,ck,10,0,12.8571,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,3.26,1632.16,0.0 -256,1,4608,16384,torch.float8_e4m3fn,ck,10,0,29.8957,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.05,2526.22,0.0 -256,1,5120,640,torch.float8_e4m3fn,ck,23,0,5.7466,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,1.14,572.11,0.0 -256,1,5120,1280,torch.float8_e4m3fn,ck,108,0,6.8374,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1.92,960.18,0.0 -256,1,5120,3200,torch.float8_e4m3fn,ck,23,0,15.188,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,2.16,1079.63,0.0 -256,1,5120,5120,torch.float8_e4m3fn,ck,10,0,11.7201,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,4.47,2238.02,0.0 -256,1,5120,6400,torch.float8_e4m3fn,ck,108,0,19.1846,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.42,1708.9,0.0 -256,1,5120,25600,torch.float8_e4m3fn,ck,10,0,46.1466,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.68,2841.12,0.0 -256,1,6400,5120,torch.float8_e4m3fn,ck,24,0,12.5759,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,5.21,2607.04,0.0 -256,1,7168,8192,torch.float8_e4m3fn,ck,24,0,15.0501,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,7.8,3903.15,0.0 -256,1,8192,1024,torch.float8_e4m3fn,ck,108,0,5.9704,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2.81,1407.95,0.0 -256,1,8192,2048,torch.float8_e4m3fn,ck,10,0,5.8009,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.78,2895.35,0.0 
-256,1,8192,3584,torch.float8_e4m3fn,ck,10,0,7.8125,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,7.52,3760.65,0.0 -256,1,8192,7168,torch.float8_e4m3fn,ck,24,0,13.9797,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,8.4,4202.08,0.0 -256,1,8192,8192,torch.float8_e4m3fn,ck,10,0,14.8978,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.01,4506.27,0.0 -256,1,8192,28672,torch.float8_e4m3fn,ck,10,0,46.0411,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.2,5102.53,0.0 -256,1,9216,16384,torch.float8_e4m3fn,ck,10,0,35.2763,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.56,4281.34,0.0 -256,1,10240,8192,torch.float8_e4m3fn,ck,10,0,16.9524,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.9,4950.02,0.0 -256,1,12800,5120,torch.float8_e4m3fn,ck,10,0,14.7141,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.91,4456.05,0.0 -256,1,13312,16384,torch.float8_e4m3fn,ck,10,0,40.7795,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.7,5349.42,0.0 -256,1,14336,8192,torch.float8_e4m3fn,ck,10,0,21.766,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.79,5397.29,0.0 -256,1,16384,2048,torch.float8_e4m3fn,ck,24,0,9.9256,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,6.76,3384.1,0.0 -256,1,16384,4096,torch.float8_e4m3fn,ck,24,0,14.6481,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,9.16,4583.92,0.0 -256,1,16384,6656,torch.float8_e4m3fn,ck,24,0,21.1312,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,10.32,5162.57,0.0 
-256,1,16384,8192,torch.float8_e4m3fn,ck,10,0,25.6037,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.48,5243.72,0.0 -256,1,16384,13312,torch.float8_e4m3fn,ck,11,0,39.322,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,11.09,5547.78,0.0 -256,1,16384,26624,torch.float8_e4m3fn,ck,5,0,73.9656,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,11.79,5898.24,0.0 -256,1,26624,16384,torch.float8_e4m3fn,ck,6,0,76.8576,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.35,5676.44,0.0 -256,1,51200,5120,torch.float8_e4m3fn,ck,16,0,45.1392,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.61,5809.84,0.0 -256,1,53248,16384,torch.float8_e4m3fn,ck,23,0,150.2505,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,11.61,5807.22,0.0 -256,1,57344,8192,torch.float8_e4m3fn,ck,9,0,77.922,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,12.06,6030.2,0.0 -256,4,16384,6656,torch.float8_e4m3fn,ck,11,0,20.7235,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,42.1,5269.84,0.0 -256,8,6656,16384,torch.float8_e4m3fn,ck,10,0,31.2392,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,55.85,3498.47,0.0 -256,8,16384,6656,torch.float8_e4m3fn,ck,11,0,20.8171,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,83.82,5253.72,0.0 -256,16,800,5120,torch.float8_e4m3fn,ck,24,0,10.5404,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,12.44,398.8,0.0 -256,16,1280,8192,torch.float8_e4m3fn,ck,10,0,12.0223,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,27.91,886.5,0.0 
-256,16,2304,16384,torch.float8_e4m3fn,ck,24,0,24.7047,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,48.9,1541.59,0.0 -256,16,2560,8192,torch.float8_e4m3fn,ck,10,0,11.95,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,56.16,1772.76,0.0 -256,16,4608,16384,torch.float8_e4m3fn,ck,24,0,25.8456,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,93.48,2936.94,0.0 -256,16,5120,640,torch.float8_e4m3fn,ck,23,0,5.7485,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,18.24,600.31,0.0 -256,16,5120,1280,torch.float8_e4m3fn,ck,29,0,6.7718,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,30.97,995.0,0.0 -256,16,5120,3200,torch.float8_e4m3fn,ck,23,0,14.71,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,35.64,1128.42,0.0 -256,16,5120,5120,torch.float8_e4m3fn,ck,24,0,10.554,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,79.48,2507.12,0.0 -256,16,5120,6400,torch.float8_e4m3fn,ck,15,0,19.3187,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,54.28,1709.96,0.0 -256,16,5120,25600,torch.float8_e4m3fn,ck,24,0,39.1761,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,107.06,3360.35,0.0 -256,16,6400,5120,torch.float8_e4m3fn,ck,10,0,11.3305,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,92.54,2917.32,0.0 -256,16,7168,8192,torch.float8_e4m3fn,ck,10,0,13.6022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,138.14,4343.47,0.0 -256,16,8192,1024,torch.float8_e4m3fn,ck,15,0,5.9167,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,45.37,1464.86,0.0 
-256,16,8192,2048,torch.float8_e4m3fn,ck,10,0,5.6214,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,95.5,3036.99,0.0 -256,16,8192,3584,torch.float8_e4m3fn,ck,10,0,7.8314,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,119.97,3789.82,0.0 -256,16,8192,7168,torch.float8_e4m3fn,ck,10,0,12.5267,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,150.0,4717.69,0.0 -256,16,8192,8192,torch.float8_e4m3fn,ck,10,0,13.9737,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,153.68,4830.65,0.0 -256,16,8192,28672,torch.float8_e4m3fn,ck,10,0,41.1742,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,182.55,5722.08,0.0 -256,16,9216,16384,torch.float8_e4m3fn,ck,24,0,32.3786,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,149.23,4680.62,0.0 -256,16,10240,8192,torch.float8_e4m3fn,ck,10,0,17.892,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,150.03,4714.11,0.0 -256,16,12800,5120,torch.float8_e4m3fn,ck,5,0,14.0716,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,149.03,4692.25,0.0 -256,16,13312,16384,torch.float8_e4m3fn,ck,10,0,40.3808,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,172.84,5418.22,0.0 -256,16,14336,8192,torch.float8_e4m3fn,ck,10,0,21.2685,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,176.7,5549.54,0.0 -256,16,16384,2048,torch.float8_e4m3fn,ck,10,0,7.7604,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,138.36,4395.58,0.0 -256,16,16384,4096,torch.float8_e4m3fn,ck,26,0,17.4151,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2,123.31,3887.36,0.0 
-256,16,16384,6656,torch.float8_e4m3fn,ck,24,0,20.8707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,167.2,5255.34,0.0 -256,16,16384,8192,torch.float8_e4m3fn,ck,10,0,25.0145,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,171.7,5391.8,0.0 -256,16,16384,13312,torch.float8_e4m3fn,ck,5,0,37.8531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,184.38,5781.33,0.0 -256,16,16384,26624,torch.float8_e4m3fn,ck,11,0,72.6906,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,192.03,6013.95,0.0 -256,16,26624,16384,torch.float8_e4m3fn,ck,6,0,77.3342,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,180.5,5654.96,0.0 -256,16,51200,5120,torch.float8_e4m3fn,ck,17,0,46.1585,a8w8_bpreshuffle_256x16x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,181.73,5716.48,0.0 -256,16,53248,16384,torch.float8_e4m3fn,ck,31,0,151.3677,a8w8_bpreshuffle_256x16x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v2,184.43,5776.54,0.0 -256,16,57344,8192,torch.float8_e4m3fn,ck,23,0,79.5497,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,188.97,5929.98,0.0 -256,32,800,5120,torch.float8_e4m3fn,ck,24,0,10.2279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,25.63,421.5,0.0 -256,32,1280,8192,torch.float8_e4m3fn,ck,10,0,11.3752,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,59.0,952.06,0.0 -256,32,2304,16384,torch.float8_e4m3fn,ck,24,0,25.2915,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,95.52,1519.11,0.0 -256,32,2560,8192,torch.float8_e4m3fn,ck,10,0,12.0316,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,111.55,1778.44,0.0 
-256,32,4608,16384,torch.float8_e4m3fn,ck,24,0,29.2704,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,165.08,2607.3,0.0 -256,32,5120,640,torch.float8_e4m3fn,ck,23,0,6.4386,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,32.57,563.0,0.0 -256,32,5120,1280,torch.float8_e4m3fn,ck,29,0,6.9951,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,59.96,989.58,0.0 -256,32,5120,3200,torch.float8_e4m3fn,ck,9,0,15.6619,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,66.95,1073.57,0.0 -256,32,5120,5120,torch.float8_e4m3fn,ck,11,0,11.6951,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,143.46,2283.51,0.0 -256,32,5120,6400,torch.float8_e4m3fn,ck,15,0,19.3178,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,108.56,1723.82,0.0 -256,32,5120,25600,torch.float8_e4m3fn,ck,24,0,44.1556,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,189.98,2994.39,0.0 -256,32,6400,5120,torch.float8_e4m3fn,ck,5,0,12.3982,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,169.15,2689.22,0.0 -256,32,7168,8192,torch.float8_e4m3fn,ck,11,0,15.5322,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,241.96,3826.96,0.0 -256,32,8192,1024,torch.float8_e4m3fn,ck,108,0,6.0451,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,88.81,1479.82,0.0 -256,32,8192,2048,torch.float8_e4m3fn,ck,5,0,6.2663,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,171.35,2771.5,0.0 -256,32,8192,3584,torch.float8_e4m3fn,ck,5,0,7.9538,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,236.25,3771.67,0.0 
-256,32,8192,7168,torch.float8_e4m3fn,ck,11,0,14.5895,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,257.59,4076.49,0.0 -256,32,8192,8192,torch.float8_e4m3fn,ck,5,0,15.7818,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,272.15,4302.13,0.0 -256,32,8192,28672,torch.float8_e4m3fn,ck,5,0,47.2548,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,318.11,5001.03,0.0 -256,32,9216,16384,torch.float8_e4m3fn,ck,24,0,39.3729,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,245.44,3863.29,0.0 -256,32,10240,8192,torch.float8_e4m3fn,ck,10,0,22.5861,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,237.7,3754.68,0.0 -256,32,12800,5120,torch.float8_e4m3fn,ck,12,0,17.5533,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,238.95,3789.55,0.0 -256,32,13312,16384,torch.float8_e4m3fn,ck,12,0,46.3062,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,301.44,4739.76,0.0 -256,32,14336,8192,torch.float8_e4m3fn,ck,6,0,26.4468,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,284.2,4485.24,0.0 -256,32,16384,2048,torch.float8_e4m3fn,ck,6,0,9.6867,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,221.69,3578.98,0.0 -256,32,16384,4096,torch.float8_e4m3fn,ck,26,0,17.6799,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2,242.93,3862.49,0.0 -256,32,16384,6656,torch.float8_e4m3fn,ck,12,0,23.1146,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,301.94,4772.46,0.0 -256,32,16384,8192,torch.float8_e4m3fn,ck,19,0,28.8166,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,298.09,4703.14,0.0 
-256,32,16384,13312,torch.float8_e4m3fn,ck,12,0,40.7803,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,342.29,5384.42,0.0 -256,32,16384,26624,torch.float8_e4m3fn,ck,12,0,77.4146,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,360.62,5659.24,0.0 -256,32,26624,16384,torch.float8_e4m3fn,ck,119,0,80.3452,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,347.47,5456.9,0.0 -256,32,51200,5120,torch.float8_e4m3fn,ck,133,0,47.8343,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,350.74,5552.18,0.0 -256,32,53248,16384,torch.float8_e4m3fn,ck,136,0,154.4861,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,361.42,5672.66,0.0 -256,32,57344,8192,torch.float8_e4m3fn,ck,85,0,86.5188,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.49,5475.04,0.0 -256,64,192,1024,torch.float8_e4m3fn,ck,15,0,4.5229,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,5.56,63.39,0.0 -256,64,800,5120,torch.float8_e4m3fn,ck,10,0,10.5456,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,49.72,429.19,0.0 -256,64,1280,8192,torch.float8_e4m3fn,ck,10,0,11.9979,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,111.87,931.32,0.0 -256,64,2304,16384,torch.float8_e4m3fn,ck,19,0,27.9421,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,172.92,1399.04,0.0 -256,64,2560,8192,torch.float8_e4m3fn,ck,5,0,12.8533,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,208.85,1697.89,0.0 -256,64,4608,16384,torch.float8_e4m3fn,ck,10,0,36.4754,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,264.94,2114.74,0.0 
-256,64,5120,640,torch.float8_e4m3fn,ck,23,0,6.7207,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,62.41,591.18,0.0 -256,64,5120,1280,torch.float8_e4m3fn,ck,29,0,7.6564,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,109.56,952.26,0.0 -256,64,5120,3200,torch.float8_e4m3fn,ck,23,0,15.6284,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,134.19,1103.39,0.0 -256,64,5120,5120,torch.float8_e4m3fn,ck,10,0,14.3407,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,233.98,1896.52,0.0 -256,64,5120,6400,torch.float8_e4m3fn,ck,15,0,19.7146,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,212.75,1716.14,0.0 -256,64,5120,25600,torch.float8_e4m3fn,ck,24,0,52.8565,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,317.41,2523.17,0.0 -256,64,6400,5120,torch.float8_e4m3fn,ck,12,0,14.7961,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,283.47,2292.15,0.0 -256,64,7168,8192,torch.float8_e4m3fn,ck,12,0,20.6714,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,363.6,2910.4,0.0 -256,64,8192,2048,torch.float8_e4m3fn,ck,5,0,7.5112,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,285.9,2390.68,0.0 -256,64,8192,3584,torch.float8_e4m3fn,ck,12,0,11.0813,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,339.14,2764.85,0.0 -256,64,8192,7168,torch.float8_e4m3fn,ck,12,0,19.2039,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,391.39,3136.22,0.0 -256,64,8192,8192,torch.float8_e4m3fn,ck,12,0,21.1081,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,406.95,3253.81,0.0 
-256,64,8192,28672,torch.float8_e4m3fn,ck,12,0,64.9256,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,463.06,3662.11,0.0 -256,64,9216,16384,torch.float8_e4m3fn,ck,114,0,56.6696,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,341.05,2703.8,0.0 -256,64,10240,8192,torch.float8_e4m3fn,ck,114,0,25.5687,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,419.94,3352.58,0.0 -256,64,12800,5120,torch.float8_e4m3fn,ck,119,0,21.9262,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,382.58,3078.6,0.0 -256,64,13312,16384,torch.float8_e4m3fn,ck,114,0,60.16,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,464.05,3671.15,0.0 -256,64,14336,8192,torch.float8_e4m3fn,ck,114,0,29.5848,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,508.11,4049.37,0.0 -256,64,16384,2048,torch.float8_e4m3fn,ck,119,0,11.4341,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,375.63,3129.47,0.0 -256,64,16384,4096,torch.float8_e4m3fn,ck,119,0,20.8735,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,411.52,3328.06,0.0 -256,64,16384,6656,torch.float8_e4m3fn,ck,119,0,29.3009,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,476.39,3807.9,0.0 -256,64,16384,8192,torch.float8_e4m3fn,ck,119,0,35.6664,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,481.68,3836.64,0.0 -256,64,16384,13312,torch.float8_e4m3fn,ck,119,0,53.1879,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,524.88,4156.08,0.0 
-256,64,16384,26624,torch.float8_e4m3fn,ck,119,0,102.4795,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,544.84,4293.63,0.0 -256,64,26624,16384,torch.float8_e4m3fn,ck,121,0,91.6538,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,609.19,4807.92,0.0 -256,64,51200,5120,torch.float8_e4m3fn,ck,135,0,54.3316,a8w8_bpreshuffle_256x64x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,617.59,4951.54,0.0 -256,64,53248,16384,torch.float8_e4m3fn,ck,135,0,156.5996,a8w8_bpreshuffle_256x64x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,713.09,5621.21,0.0 -256,64,57344,8192,torch.float8_e4m3fn,ck,85,0,87.6062,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,686.36,5451.97,0.0 -256,128,800,5120,torch.float8_e4m3fn,ck,24,0,9.9852,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,105.01,496.35,0.0 -256,128,1280,8192,torch.float8_e4m3fn,ck,5,0,12.228,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,219.53,970.07,0.0 -256,128,2304,16384,torch.float8_e4m3fn,ck,24,0,33.0153,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,292.7,1224.76,0.0 -256,128,2560,8192,torch.float8_e4m3fn,ck,10,0,17.8034,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,301.56,1273.66,0.0 -256,128,4608,16384,torch.float8_e4m3fn,ck,108,0,54.3179,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,355.82,1450.25,0.0 -256,128,5120,640,torch.float8_e4m3fn,ck,23,0,6.3513,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,132.08,735.19,0.0 -256,128,5120,1280,torch.float8_e4m3fn,ck,29,0,7.7817,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,215.6,1031.67,0.0 
-256,128,5120,3200,torch.float8_e4m3fn,ck,23,0,16.0101,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,261.98,1130.81,0.0 -256,128,5120,5120,torch.float8_e4m3fn,ck,108,0,19.63,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,341.87,1435.58,0.0 -256,128,5120,6400,torch.float8_e4m3fn,ck,29,0,22.9859,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,364.95,1518.23,0.0 -256,128,5120,25600,torch.float8_e4m3fn,ck,114,0,83.5847,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,401.44,1623.02,0.0 -256,128,6400,5120,torch.float8_e4m3fn,ck,114,0,20.5908,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,407.4,1702.79,0.0 -256,128,7168,8192,torch.float8_e4m3fn,ck,114,0,23.7352,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,633.34,2595.46,0.0 -256,128,8192,1024,torch.float8_e4m3fn,ck,119,0,6.9407,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,309.4,1529.65,0.0 -256,128,8192,2048,torch.float8_e4m3fn,ck,114,0,8.8614,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,484.68,2159.54,0.0 -256,128,8192,3584,torch.float8_e4m3fn,ck,114,0,12.978,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,579.15,2459.24,0.0 -256,128,8192,7168,torch.float8_e4m3fn,ck,114,0,21.6976,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,692.81,2845.24,0.0 -256,128,8192,8192,torch.float8_e4m3fn,ck,114,0,24.8294,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,691.92,2829.49,0.0 -256,128,8192,28672,torch.float8_e4m3fn,ck,114,0,75.1872,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,799.73,3200.65,0.0 
-256,128,9216,16384,torch.float8_e4m3fn,ck,126,0,72.8197,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,530.83,2134.74,0.0 -256,128,10240,8192,torch.float8_e4m3fn,ck,113,0,30.4124,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,706.12,2878.96,0.0 -256,128,12800,5120,torch.float8_e4m3fn,ck,65,0,28.9421,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,579.68,2400.25,0.0 -256,128,13312,16384,torch.float8_e4m3fn,ck,144,0,79.9061,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,698.75,2798.4,0.0 -256,128,14336,8192,torch.float8_e4m3fn,ck,144,0,36.1842,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,830.88,3376.03,0.0 -256,128,16384,2048,torch.float8_e4m3fn,ck,119,0,15.5043,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,554.04,2451.63,0.0 -256,128,16384,4096,torch.float8_e4m3fn,ck,121,0,26.5935,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,646.02,2700.94,0.0 -256,128,16384,6656,torch.float8_e4m3fn,ck,65,0,39.0387,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,715.12,2922.69,0.0 -256,128,16384,8192,torch.float8_e4m3fn,ck,121,0,45.9107,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,748.4,3037.65,0.0 -256,128,16384,13312,torch.float8_e4m3fn,ck,65,0,70.6454,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,790.35,3170.79,0.0 -256,128,16384,26624,torch.float8_e4m3fn,ck,144,0,134.0597,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,832.98,3310.54,0.0 
-256,128,26624,16384,torch.float8_e4m3fn,ck,139,0,108.1175,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1032.85,4117.01,0.0 -256,128,51200,5120,torch.float8_e4m3fn,ck,154,0,65.6842,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1021.69,4200.5,0.0 -256,128,53248,16384,torch.float8_e4m3fn,ck,154,0,184.7497,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1208.87,4807.28,0.0 -256,128,57344,8192,torch.float8_e4m3fn,ck,54,0,95.8445,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1254.73,5065.4,0.0 -256,256,800,5120,torch.float8_e4m3fn,ck,24,0,11.6817,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,179.52,497.9,0.0 -256,256,1280,8192,torch.float8_e4m3fn,ck,12,0,17.1516,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,313.02,771.84,0.0 -256,256,2304,16384,torch.float8_e4m3fn,ck,15,0,52.6426,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,367.14,819.16,0.0 -256,256,2560,8192,torch.float8_e4m3fn,ck,114,0,21.4545,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,500.47,1136.33,0.0 -256,256,4608,16384,torch.float8_e4m3fn,ck,116,0,68.5478,a8w8_bpreshuffle_256x96x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,563.91,1196.99,0.0 -256,256,5120,640,torch.float8_e4m3fn,ck,76,0,7.7921,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,215.31,777.98,0.0 -256,256,5120,1280,torch.float8_e4m3fn,ck,119,0,10.1726,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,329.85,934.15,0.0 -256,256,5120,3200,torch.float8_e4m3fn,ck,77,0,19.1458,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,438.14,1035.46,0.0 
-256,256,5120,5120,torch.float8_e4m3fn,ck,116,0,25.9267,a8w8_bpreshuffle_256x96x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,517.68,1162.76,0.0 -256,256,5120,6400,torch.float8_e4m3fn,ck,65,0,28.085,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,597.37,1318.42,0.0 -256,256,5120,25600,torch.float8_e4m3fn,ck,116,0,106.6527,a8w8_bpreshuffle_256x96x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,629.23,1314.99,0.0 -256,256,6400,5120,torch.float8_e4m3fn,ck,65,0,26.9347,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,622.88,1386.89,0.0 -256,256,7168,8192,torch.float8_e4m3fn,ck,65,0,29.6374,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1014.42,2175.88,0.0 -256,256,8192,1024,torch.float8_e4m3fn,ck,77,0,9.7085,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,442.39,1323.07,0.0 -256,256,8192,2048,torch.float8_e4m3fn,ck,114,0,11.9681,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,717.74,1796.09,0.0 -256,256,8192,3584,torch.float8_e4m3fn,ck,144,0,16.6491,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,902.89,2070.5,0.0 -256,256,8192,7168,torch.float8_e4m3fn,ck,65,0,28.6205,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1050.46,2262.35,0.0 -256,256,8192,8192,torch.float8_e4m3fn,ck,144,0,30.9086,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1111.66,2374.75,0.0 -256,256,8192,28672,torch.float8_e4m3fn,ck,65,0,95.3629,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1261.07,2583.98,0.0 -256,256,9216,16384,torch.float8_e4m3fn,ck,124,0,84.3085,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,916.98,1896.7,0.0 
-256,256,10240,8192,torch.float8_e4m3fn,ck,123,0,38.8017,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1106.9,2351.09,0.0 -256,256,12800,5120,torch.float8_e4m3fn,ck,139,0,34.8457,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,962.94,2106.44,0.0 -256,256,13312,16384,torch.float8_e4m3fn,ck,139,0,92.6483,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1205.3,2472.94,0.0 -256,256,14336,8192,torch.float8_e4m3fn,ck,139,0,45.4901,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1321.82,2789.13,0.0 -256,256,16384,2048,torch.float8_e4m3fn,ck,139,0,18.9985,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,904.28,2235.3,0.0 -256,256,16384,4096,torch.float8_e4m3fn,ck,63,0,32.6482,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1052.42,2344.57,0.0 -256,256,16384,6656,torch.float8_e4m3fn,ck,139,0,46.695,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1195.73,2551.55,0.0 -256,256,16384,8192,torch.float8_e4m3fn,ck,139,0,55.672,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1234.36,2599.21,0.0 -256,256,16384,13312,torch.float8_e4m3fn,ck,139,0,83.1524,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1342.95,2764.81,0.0 -256,256,16384,26624,torch.float8_e4m3fn,ck,139,0,163.3311,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1367.4,2763.78,0.0 -256,256,26624,16384,torch.float8_e4m3fn,ck,154,0,141.6113,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1577.12,3206.19,0.0 
-256,256,51200,5120,torch.float8_e4m3fn,ck,154,0,97.921,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1370.67,2958.19,0.0 -256,256,53248,16384,torch.float8_e4m3fn,ck,154,0,276.3785,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1616.18,3270.42,0.0 -256,256,57344,8192,torch.float8_e4m3fn,ck,33,0,129.4795,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1857.58,3871.03,0.0 -256,512,800,5120,torch.float8_e4m3fn,ck,10,0,15.082,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,278.1,499.71,0.0 -256,512,1280,8192,torch.float8_e4m3fn,ck,114,0,21.033,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,510.5,760.27,0.0 -256,512,2304,16384,torch.float8_e4m3fn,ck,115,0,63.7872,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,605.99,760.29,0.0 -256,512,2560,8192,torch.float8_e4m3fn,ck,113,0,27.2155,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,789.07,1021.01,0.0 -256,512,4608,16384,torch.float8_e4m3fn,ck,124,0,84.5982,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,913.84,1047.36,0.0 -256,512,5120,640,torch.float8_e4m3fn,ck,77,0,9.0563,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,370.51,976.93,0.0 -256,512,5120,1280,torch.float8_e4m3fn,ck,77,0,12.8224,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,523.37,971.1,0.0 -256,512,5120,3200,torch.float8_e4m3fn,ck,85,0,23.0489,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,727.9,1009.39,0.0 -256,512,5120,5120,torch.float8_e4m3fn,ck,124,0,31.9895,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,839.14,1065.31,0.0 
-256,512,5120,6400,torch.float8_e4m3fn,ck,139,0,36.8067,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,911.64,1121.74,0.0 -256,512,5120,25600,torch.float8_e4m3fn,ck,124,0,128.0705,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1048.0,1166.72,0.0 -256,512,6400,5120,torch.float8_e4m3fn,ck,124,0,33.3791,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1005.25,1256.57,0.0 -256,512,7168,8192,torch.float8_e4m3fn,ck,63,0,42.0669,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1429.38,1670.07,0.0 -256,512,8192,1024,torch.float8_e4m3fn,ck,85,0,11.5306,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,744.97,1500.49,0.0 -256,512,8192,2048,torch.float8_e4m3fn,ck,139,0,15.6554,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1097.38,1674.46,0.0 -256,512,8192,3584,torch.float8_e4m3fn,ck,139,0,22.7592,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1320.99,1739.24,0.0 -256,512,8192,7168,torch.float8_e4m3fn,ck,139,0,39.3552,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1527.87,1798.46,0.0 -256,512,8192,8192,torch.float8_e4m3fn,ck,139,0,43.6902,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1572.88,1824.02,0.0 -256,512,8192,28672,torch.float8_e4m3fn,ck,139,0,133.9721,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1795.29,1925.4,0.0 -256,512,9216,16384,torch.float8_e4m3fn,ck,149,0,104.8693,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1474.4,1609.82,0.0 
-256,512,10240,8192,torch.float8_e4m3fn,ck,143,0,55.4465,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1549.23,1777.68,0.0 -256,512,12800,5120,torch.float8_e4m3fn,ck,138,0,46.2157,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1452.08,1758.38,0.0 -256,512,13312,16384,torch.float8_e4m3fn,ck,154,0,127.8746,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1746.54,1877.81,0.0 -256,512,14336,8192,torch.float8_e4m3fn,ck,154,0,63.6195,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1890.29,2142.66,0.0 -256,512,16384,2048,torch.float8_e4m3fn,ck,154,0,27.0054,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1272.33,1902.59,0.0 -256,512,16384,4096,torch.float8_e4m3fn,ck,154,0,43.8782,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1566.14,1959.59,0.0 -256,512,16384,6656,torch.float8_e4m3fn,ck,154,0,64.427,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1733.27,2005.94,0.0 -256,512,16384,8192,torch.float8_e4m3fn,ck,154,0,73.6131,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1867.04,2108.17,0.0 -256,512,16384,13312,torch.float8_e4m3fn,ck,154,0,114.1614,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1956.34,2117.15,0.0 -256,512,16384,26624,torch.float8_e4m3fn,ck,154,0,214.713,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2080.34,2173.21,0.0 -256,512,26624,16384,torch.float8_e4m3fn,ck,154,0,250.2793,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1784.71,1885.33,0.0 
-256,512,51200,5120,torch.float8_e4m3fn,ck,158,0,174.8476,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1535.25,1814.12,0.0 -256,512,53248,16384,torch.float8_e4m3fn,ck,154,0,492.8785,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1812.52,1897.69,0.0 -256,512,57344,8192,torch.float8_e4m3fn,ck,33,0,227.0683,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2118.47,2345.89,0.0 -256,1024,800,5120,torch.float8_e4m3fn,ck,24,0,24.558,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,341.58,446.99,0.0 -256,1024,1280,8192,torch.float8_e4m3fn,ck,113,0,26.819,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,800.73,801.51,0.0 -256,1024,2304,16384,torch.float8_e4m3fn,ck,124,0,81.3585,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,950.23,728.19,0.0 -256,1024,2560,8192,torch.float8_e4m3fn,ck,123,0,37.4372,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1147.25,924.29,0.0 -256,1024,4608,16384,torch.float8_e4m3fn,ck,149,0,104.0599,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1485.86,977.44,0.0 -256,1024,5120,640,torch.float8_e4m3fn,ck,86,0,11.1521,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,601.76,1292.84,0.0 -256,1024,5120,1280,torch.float8_e4m3fn,ck,86,0,16.1465,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,831.25,1136.47,0.0 -256,1024,5120,3200,torch.float8_e4m3fn,ck,51,0,29.1651,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1150.5,1033.65,0.0 
-256,1024,5120,5120,torch.float8_e4m3fn,ck,137,0,40.2614,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1333.46,1041.77,0.0 -256,1024,5120,6400,torch.float8_e4m3fn,ck,137,0,48.5222,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1383.05,1026.49,0.0 -256,1024,5120,25600,torch.float8_e4m3fn,ck,137,0,162.5939,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1650.96,1031.85,0.0 -256,1024,6400,5120,torch.float8_e4m3fn,ck,138,0,45.101,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1487.97,1133.41,0.0 -256,1024,7168,8192,torch.float8_e4m3fn,ck,154,0,62.2696,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1931.26,1313.46,0.0 -256,1024,8192,1024,torch.float8_e4m3fn,ck,0,0,16.8199,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1021.4,1558.53,0.0 -256,1024,8192,2048,torch.float8_e4m3fn,ck,154,0,23.1654,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1483.24,1539.0,0.0 -256,1024,8192,3584,torch.float8_e4m3fn,ck,154,0,34.8454,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1725.61,1429.38,0.0 -256,1024,8192,7168,torch.float8_e4m3fn,ck,154,0,59.0054,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2038.1,1403.9,0.0 -256,1024,8192,8192,torch.float8_e4m3fn,ck,154,0,64.2479,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2139.2,1436.23,0.0 -256,1024,8192,28672,torch.float8_e4m3fn,ck,154,0,197.2769,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2438.38,1424.49,0.0 
-256,1024,9216,16384,torch.float8_e4m3fn,ck,54,0,178.1558,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1735.77,1047.66,0.0 -256,1024,10240,8192,torch.float8_e4m3fn,ck,158,0,83.5863,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2055.35,1354.84,0.0 -256,1024,12800,5120,torch.float8_e4m3fn,ck,138,0,86.9605,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1543.43,1115.37,0.0 -256,1024,13312,16384,torch.float8_e4m3fn,ck,154,0,243.5736,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1833.85,1076.24,0.0 -256,1024,14336,8192,torch.float8_e4m3fn,ck,33,0,112.839,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2131.52,1375.32,0.0 -256,1024,16384,2048,torch.float8_e4m3fn,ck,33,0,49.7279,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1381.91,1391.69,0.0 -256,1024,16384,4096,torch.float8_e4m3fn,ck,33,0,81.4525,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1687.35,1287.35,0.0 -256,1024,16384,6656,torch.float8_e4m3fn,ck,2,0,120.523,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1853.08,1239.78,0.0 -256,1024,16384,8192,torch.float8_e4m3fn,ck,154,0,136.0178,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2020.9,1295.13,0.0 -256,1024,16384,13312,torch.float8_e4m3fn,ck,154,0,214.8294,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2079.22,1234.89,0.0 -256,1024,16384,26624,torch.float8_e4m3fn,ck,154,0,412.5087,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2165.66,1204.88,0.0 
-256,1024,26624,16384,torch.float8_e4m3fn,ck,33,0,467.1211,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1912.47,1086.47,0.0 -256,1024,51200,5120,torch.float8_e4m3fn,ck,40,0,288.5662,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1860.48,1289.98,0.0 -256,1024,53248,16384,torch.float8_e4m3fn,ck,154,0,850.624,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2100.47,1173.54,0.0 -256,1024,57344,8192,torch.float8_e4m3fn,ck,33,0,409.7151,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2348.15,1453.67,0.0 -256,2048,800,5120,torch.float8_e4m3fn,ck,24,0,41.7422,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,401.92,427.83,0.0 -256,2048,1280,8192,torch.float8_e4m3fn,ck,123,0,36.7116,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1169.92,885.44,0.0 -256,2048,2304,16384,torch.float8_e4m3fn,ck,131,0,101.2648,a8w8_bpreshuffle_256x112x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1526.88,797.32,0.0 -256,2048,2560,8192,torch.float8_e4m3fn,ck,138,0,55.1683,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1557.04,874.32,0.0 -256,2048,4608,16384,torch.float8_e4m3fn,ck,68,0,176.1003,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1756.03,726.44,0.0 -256,2048,5120,640,torch.float8_e4m3fn,ck,102,0,16.8369,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,797.16,1518.04,0.0 -256,2048,5120,1280,torch.float8_e4m3fn,ck,102,0,24.8167,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1081.67,1214.77,0.0 
-256,2048,5120,3200,torch.float8_e4m3fn,ck,158,0,41.6304,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1612.02,1054.74,0.0 -256,2048,5120,5120,torch.float8_e4m3fn,ck,102,0,71.8612,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1494.19,802.54,0.0 -256,2048,5120,6400,torch.float8_e4m3fn,ck,158,0,76.511,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1754.23,873.69,0.0 -256,2048,5120,25600,torch.float8_e4m3fn,ck,137,0,301.0835,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1783.13,679.12,0.0 -256,2048,6400,5120,torch.float8_e4m3fn,ck,138,0,84.6213,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1586.1,820.93,0.0 -256,2048,7168,8192,torch.float8_e4m3fn,ck,33,0,112.617,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2135.72,931.1,0.0 -256,2048,8192,1024,torch.float8_e4m3fn,ck,86,0,28.9708,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1186.01,1520.16,0.0 -256,2048,8192,2048,torch.float8_e4m3fn,ck,33,0,42.2356,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1627.05,1291.0,0.0 -256,2048,8192,3584,torch.float8_e4m3fn,ck,33,0,61.9146,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1942.34,1134.7,0.0 -256,2048,8192,7168,torch.float8_e4m3fn,ck,33,0,105.8446,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2272.37,1010.49,0.0 -256,2048,8192,8192,torch.float8_e4m3fn,ck,33,0,117.6729,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2335.95,998.03,0.0 
-256,2048,8192,28672,torch.float8_e4m3fn,ck,33,0,357.8032,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2688.83,914.35,0.0 -256,2048,9216,16384,torch.float8_e4m3fn,ck,68,0,305.2233,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2026.3,728.31,0.0 -256,2048,10240,8192,torch.float8_e4m3fn,ck,158,0,156.1052,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2201.06,913.53,0.0 -256,2048,12800,5120,torch.float8_e4m3fn,ck,40,0,154.2962,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1739.74,832.49,0.0 -256,2048,13312,16384,torch.float8_e4m3fn,ck,138,0,457.646,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1952.06,669.04,0.0 -256,2048,14336,8192,torch.float8_e4m3fn,ck,54,0,205.0365,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2346.1,940.99,0.0 -256,2048,16384,2048,torch.float8_e4m3fn,ck,33,0,89.7238,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1531.8,1168.67,0.0 -256,2048,16384,4096,torch.float8_e4m3fn,ck,33,0,146.1468,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1880.83,975.77,0.0 -256,2048,16384,6656,torch.float8_e4m3fn,ck,33,0,216.3109,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2064.97,877.4,0.0 -256,2048,16384,8192,torch.float8_e4m3fn,ck,33,0,253.8314,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2165.83,859.25,0.0 -256,2048,16384,13312,torch.float8_e4m3fn,ck,33,0,388.9416,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2296.88,803.4,0.0 
-256,2048,16384,26624,torch.float8_e4m3fn,ck,154,0,777.8647,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2296.94,717.15,0.0 -256,2048,26624,16384,torch.float8_e4m3fn,ck,154,0,848.1746,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2106.53,682.42,0.0 -256,2048,51200,5120,torch.float8_e4m3fn,ck,33,0,524.9075,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2045.58,918.91,0.0 -256,2048,53248,16384,torch.float8_e4m3fn,ck,33,0,1541.2203,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2318.56,729.34,0.0 -256,2048,57344,8192,torch.float8_e4m3fn,ck,33,0,762.0358,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2525.01,946.7,0.0 -256,4096,800,5120,torch.float8_e4m3fn,ck,69,0,50.1312,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,669.33,630.77,0.0 -256,4096,1280,8192,torch.float8_e4m3fn,ck,137,0,53.9659,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1591.73,1010.38,0.0 -256,4096,2304,16384,torch.float8_e4m3fn,ck,68,0,177.9892,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1737.4,695.17,0.0 -256,4096,2560,8192,torch.float8_e4m3fn,ck,158,0,83.4719,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2058.16,904.47,0.0 -256,4096,4608,16384,torch.float8_e4m3fn,ck,68,0,307.0339,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2014.36,587.41,0.0 -256,4096,5120,640,torch.float8_e4m3fn,ck,102,0,28.9248,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,928.05,1653.99,0.0 
-256,4096,5120,1280,torch.float8_e4m3fn,ck,102,0,42.5949,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1260.41,1261.64,0.0 -256,4096,5120,3200,torch.float8_e4m3fn,ck,47,0,80.2905,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1671.65,889.7,0.0 -256,4096,5120,5120,torch.float8_e4m3fn,ck,138,0,121.2408,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1771.25,735.14,0.0 -256,4096,5120,6400,torch.float8_e4m3fn,ck,158,0,135.91,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1975.1,742.59,0.0 -256,4096,5120,25600,torch.float8_e4m3fn,ck,138,0,505.012,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2126.17,550.23,0.0 -256,4096,6400,5120,torch.float8_e4m3fn,ck,40,0,148.8986,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1802.81,713.02,0.0 -256,4096,7168,8192,torch.float8_e4m3fn,ck,156,0,206.7372,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2326.8,730.37,0.0 -256,4096,8192,1024,torch.float8_e4m3fn,ck,86,0,49.3525,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1392.42,1614.75,0.0 -256,4096,8192,2048,torch.float8_e4m3fn,ck,143,0,80.1603,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1714.55,1151.13,0.0 -256,4096,8192,3584,torch.float8_e4m3fn,ck,33,0,117.1442,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2053.18,948.82,0.0 -256,4096,8192,7168,torch.float8_e4m3fn,ck,33,0,196.6543,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2446.1,789.15,0.0 
-256,4096,8192,8192,torch.float8_e4m3fn,ck,33,0,221.6286,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2480.53,757.0,0.0 -256,4096,8192,28672,torch.float8_e4m3fn,ck,33,0,686.9896,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2800.84,610.53,0.0 -256,4096,9216,16384,torch.float8_e4m3fn,ck,95,0,554.8485,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2229.35,529.16,0.0 -256,4096,10240,8192,torch.float8_e4m3fn,ck,106,0,283.3828,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2424.97,710.44,0.0 -256,4096,12800,5120,torch.float8_e4m3fn,ck,40,0,270.4291,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1985.26,707.64,0.0 -256,4096,13312,16384,torch.float8_e4m3fn,ck,154,0,842.3709,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2121.04,468.04,0.0 -256,4096,14336,8192,torch.float8_e4m3fn,ck,158,0,387.5867,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2482.21,692.58,0.0 -256,4096,16384,2048,torch.float8_e4m3fn,ck,102,0,160.9167,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1708.2,1094.73,0.0 -256,4096,16384,4096,torch.float8_e4m3fn,ck,33,0,255.2978,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2153.39,854.31,0.0 -256,4096,16384,6656,torch.float8_e4m3fn,ck,33,0,407.5508,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2192.0,663.8,0.0 -256,4096,16384,8192,torch.float8_e4m3fn,ck,33,0,471.5371,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2331.76,640.44,0.0 
-256,4096,16384,13312,torch.float8_e4m3fn,ck,33,0,740.5043,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2412.82,549.42,0.0 -256,4096,16384,26624,torch.float8_e4m3fn,ck,33,0,1511.0293,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2364.89,449.68,0.0 -256,4096,26624,16384,torch.float8_e4m3fn,ck,33,0,1553.947,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2299.57,464.25,0.0 -256,4096,51200,5120,torch.float8_e4m3fn,ck,33,0,979.1344,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2193.25,717.52,0.0 -256,4096,53248,16384,torch.float8_e4m3fn,ck,33,0,2910.6751,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2455.38,472.65,0.0 -256,4096,57344,8192,torch.float8_e4m3fn,ck,33,0,1497.7395,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2569.4,649.7,0.0 -256,8192,800,5120,torch.float8_e4m3fn,ck,69,0,67.6989,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,991.28,873.67,0.0 -256,8192,1280,8192,torch.float8_e4m3fn,ck,158,0,83.0796,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2067.88,1186.41,0.0 -256,8192,2304,16384,torch.float8_e4m3fn,ck,149,0,311.0895,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1988.09,674.13,0.0 -256,8192,2560,8192,torch.float8_e4m3fn,ck,158,0,155.4584,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2210.22,836.39,0.0 -256,8192,4608,16384,torch.float8_e4m3fn,ck,154,0,561.2735,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2203.83,508.15,0.0 
-256,8192,5120,640,torch.float8_e4m3fn,ck,143,0,50.4892,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1063.34,1830.21,0.0 -256,8192,5120,1280,torch.float8_e4m3fn,ck,143,0,77.0321,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1393.89,1310.17,0.0 -256,8192,5120,3200,torch.float8_e4m3fn,ck,40,0,139.3777,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1925.96,907.49,0.0 -256,8192,5120,5120,torch.float8_e4m3fn,ck,154,0,215.948,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1988.89,704.07,0.0 -256,8192,5120,6400,torch.float8_e4m3fn,ck,40,0,242.9206,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2210.07,696.04,0.0 -256,8192,5120,25600,torch.float8_e4m3fn,ck,154,0,938.5635,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2288.05,452.47,0.0 -256,8192,6400,5120,torch.float8_e4m3fn,ck,40,0,270.3545,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1985.8,664.2,0.0 -256,8192,7168,8192,torch.float8_e4m3fn,ck,158,0,385.4824,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2495.76,631.08,0.0 -256,8192,8192,1024,torch.float8_e4m3fn,ck,51,0,92.436,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1486.86,1633.51,0.0 -256,8192,8192,2048,torch.float8_e4m3fn,ck,51,0,141.6683,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1940.29,1184.26,0.0 -256,8192,8192,3584,torch.float8_e4m3fn,ck,143,0,222.4514,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2162.43,867.33,0.0 
-256,8192,8192,7168,torch.float8_e4m3fn,ck,33,0,383.1227,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2511.13,656.86,0.0 -256,8192,8192,8192,torch.float8_e4m3fn,ck,33,0,424.8317,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2588.11,631.86,0.0 -256,8192,8192,28672,torch.float8_e4m3fn,ck,33,0,1389.0293,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2770.49,434.82,0.0 -256,8192,9216,16384,torch.float8_e4m3fn,ck,154,0,1095.3193,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2258.61,398.25,0.0 -256,8192,10240,8192,torch.float8_e4m3fn,ck,33,0,533.4106,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2576.61,597.6,0.0 -256,8192,12800,5120,torch.float8_e4m3fn,ck,158,0,519.2638,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2067.82,610.85,0.0 -256,8192,13312,16384,torch.float8_e4m3fn,ck,33,0,1546.1027,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2311.24,368.94,0.0 -256,8192,14336,8192,torch.float8_e4m3fn,ck,33,0,734.9548,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2618.05,570.69,0.0 -256,8192,16384,2048,torch.float8_e4m3fn,ck,102,0,303.9224,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1808.87,1048.84,0.0 -256,8192,16384,4096,torch.float8_e4m3fn,ck,33,0,497.9337,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2208.15,741.26,0.0 -256,8192,16384,6656,torch.float8_e4m3fn,ck,33,0,770.3086,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2319.47,560.83,0.0 
-256,8192,16384,8192,torch.float8_e4m3fn,ck,33,0,903.531,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2433.81,519.92,0.0 -256,8192,16384,13312,torch.float8_e4m3fn,ck,107,0,1430.2158,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2498.51,416.43,0.0 -256,8192,16384,26624,torch.float8_e4m3fn,ck,154,0,3033.7821,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2355.75,304.16,0.0 -256,8192,26624,16384,torch.float8_e4m3fn,ck,33,0,2904.3718,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2460.71,346.59,0.0 -256,8192,51200,5120,torch.float8_e4m3fn,ck,33,0,1857.9691,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2311.65,615.16,0.0 -256,8192,53248,16384,torch.float8_e4m3fn,ck,33,0,5722.4616,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2497.82,328.36,0.0 -256,8192,57344,8192,torch.float8_e4m3fn,ck,33,0,2991.7292,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2572.62,493.49,0.0 -256,16384,800,5120,torch.float8_e4m3fn,ck,69,0,108.6586,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1235.22,1050.97,0.0 -256,16384,1280,8192,torch.float8_e4m3fn,ck,158,0,157.7142,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2178.61,1183.45,0.0 -256,16384,2304,16384,torch.float8_e4m3fn,ck,154,0,577.0581,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2143.55,661.43,0.0 -256,16384,2560,8192,torch.float8_e4m3fn,ck,154,0,283.4813,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2424.13,843.35,0.0 
-256,16384,4608,16384,torch.float8_e4m3fn,ck,154,0,1050.0065,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2356.08,471.36,0.0 -256,16384,5120,640,torch.float8_e4m3fn,ck,143,0,91.8209,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1169.39,1977.05,0.0 -256,16384,5120,1280,torch.float8_e4m3fn,ck,143,0,139.3615,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1540.94,1401.37,0.0 -256,16384,5120,3200,torch.float8_e4m3fn,ck,33,0,250.8978,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2139.8,942.95,0.0 -256,16384,5120,5120,torch.float8_e4m3fn,ck,33,0,403.9327,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2126.58,687.92,0.0 -256,16384,5120,6400,torch.float8_e4m3fn,ck,107,0,443.5168,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2420.97,688.58,0.0 -256,16384,5120,25600,torch.float8_e4m3fn,ck,154,0,1817.1823,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2363.53,395.27,0.0 -256,16384,6400,5120,torch.float8_e4m3fn,ck,154,0,516.4428,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2079.11,631.96,0.0 -256,16384,7168,8192,torch.float8_e4m3fn,ck,154,0,761.2665,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2527.56,561.98,0.0 -256,16384,8192,1024,torch.float8_e4m3fn,ck,143,0,172.4777,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1593.7,1702.26,0.0 -256,16384,8192,2048,torch.float8_e4m3fn,ck,51,0,272.4657,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2017.71,1169.93,0.0 
-256,16384,8192,3584,torch.float8_e4m3fn,ck,33,0,421.1579,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2284.35,846.51,0.0 -256,16384,8192,7168,torch.float8_e4m3fn,ck,158,0,753.6754,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2553.02,589.9,0.0 -256,16384,8192,8192,torch.float8_e4m3fn,ck,158,0,852.0249,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2580.94,551.35,0.0 -256,16384,8192,28672,torch.float8_e4m3fn,ck,33,0,2785.3542,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2763.23,349.36,0.0 -256,16384,9216,16384,torch.float8_e4m3fn,ck,33,0,2047.2744,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2416.78,352.38,0.0 -256,16384,10240,8192,torch.float8_e4m3fn,ck,33,0,1068.7185,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2572.03,518.05,0.0 -256,16384,12800,5120,torch.float8_e4m3fn,ck,33,0,974.5645,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2203.53,583.7,0.0 -256,16384,13312,16384,torch.float8_e4m3fn,ck,33,0,2958.4415,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2415.74,311.9,0.0 -256,16384,14336,8192,torch.float8_e4m3fn,ck,158,0,1477.5648,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2604.48,488.25,0.0 -256,16384,16384,2048,torch.float8_e4m3fn,ck,102,0,598.3663,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1837.52,1009.38,0.0 -256,16384,16384,4096,torch.float8_e4m3fn,ck,33,0,988.8674,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2223.78,678.64,0.0 
-256,16384,16384,6656,torch.float8_e4m3fn,ck,33,0,1537.3785,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2324.35,491.08,0.0 -256,16384,16384,8192,torch.float8_e4m3fn,ck,33,0,1794.5054,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2450.84,448.76,0.0 -256,16384,16384,13312,torch.float8_e4m3fn,ck,33,0,2976.2278,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2401.3,326.95,0.0 -256,16384,16384,26624,torch.float8_e4m3fn,ck,154,0,6167.0855,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2317.73,228.52,0.0 -256,16384,26624,16384,torch.float8_e4m3fn,ck,33,0,5797.7818,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2465.37,272.01,0.0 -256,16384,51200,5120,torch.float8_e4m3fn,ck,33,0,3711.513,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2314.4,545.26,0.0 -256,16384,53248,16384,torch.float8_e4m3fn,ck,33,0,11284.2706,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2533.38,255.73,0.0 -256,16384,57344,8192,torch.float8_e4m3fn,ck,33,0,5984.2692,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2572.27,414.93,0.0 -256,32768,800,5120,torch.float8_e4m3fn,ck,69,0,196.771,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1364.2,1139.89,0.0 -256,32768,1280,8192,torch.float8_e4m3fn,ck,154,0,288.2725,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2383.84,1258.56,0.0 -256,32768,2304,16384,torch.float8_e4m3fn,ck,154,0,1073.694,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2304.1,675.81,0.0 
-256,32768,2560,8192,torch.float8_e4m3fn,ck,158,0,549.7961,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2499.82,831.54,0.0 -256,32768,4608,16384,torch.float8_e4m3fn,ck,154,0,2120.2853,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2333.55,431.24,0.0 -256,32768,5120,640,torch.float8_e4m3fn,ck,143,0,181.1291,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1185.61,1986.39,0.0 -256,32768,5120,1280,torch.float8_e4m3fn,ck,102,0,269.995,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1590.76,1422.4,0.0 -256,32768,5120,3200,torch.float8_e4m3fn,ck,33,0,492.1153,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2181.89,928.21,0.0 -256,32768,5120,5120,torch.float8_e4m3fn,ck,33,0,764.8154,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2246.28,692.36,0.0 -256,32768,5120,6400,torch.float8_e4m3fn,ck,33,0,868.9521,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2471.35,665.2,0.0 -256,32768,5120,25600,torch.float8_e4m3fn,ck,154,0,3714.3301,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2312.65,351.47,0.0 -256,32768,6400,5120,torch.float8_e4m3fn,ck,33,0,980.8335,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2189.45,632.09,0.0 -256,32768,7168,8192,torch.float8_e4m3fn,ck,158,0,1484.7467,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2591.88,536.74,0.0 -256,32768,8192,1024,torch.float8_e4m3fn,ck,143,0,340.0384,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1616.75,1702.2,0.0 
-256,32768,8192,2048,torch.float8_e4m3fn,ck,143,0,536.6149,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2048.98,1156.8,0.0 -256,32768,8192,3584,torch.float8_e4m3fn,ck,51,0,841.2106,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2287.35,812.72,0.0 -256,32768,8192,7168,torch.float8_e4m3fn,ck,158,0,1500.7026,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2564.33,553.39,0.0 -256,32768,8192,8192,torch.float8_e4m3fn,ck,158,0,1696.016,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2593.16,514.39,0.0 -256,32768,8192,28672,torch.float8_e4m3fn,ck,33,0,5558.3544,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2769.37,307.87,0.0 -256,32768,9216,16384,torch.float8_e4m3fn,ck,33,0,4103.0357,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2411.78,314.85,0.0 -256,32768,10240,8192,torch.float8_e4m3fn,ck,158,0,2102.9024,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2614.27,486.67,0.0 -256,32768,12800,5120,torch.float8_e4m3fn,ck,33,0,1888.3681,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2274.43,567.78,0.0 -256,32768,13312,16384,torch.float8_e4m3fn,ck,33,0,6036.8682,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2367.73,269.58,0.0 -256,32768,14336,8192,torch.float8_e4m3fn,ck,158,0,2945.0068,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2613.43,450.05,0.0 -256,32768,16384,2048,torch.float8_e4m3fn,ck,102,0,1180.9193,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1862.13,994.48,0.0 
-256,32768,16384,4096,torch.float8_e4m3fn,ck,33,0,1989.9333,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2210.15,640.76,0.0 -256,32768,16384,6656,torch.float8_e4m3fn,ck,33,0,3099.1632,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2306.05,452.02,0.0 -256,32768,16384,8192,torch.float8_e4m3fn,ck,33,0,3588.7344,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2451.03,411.4,0.0 -256,32768,16384,13312,torch.float8_e4m3fn,ck,33,0,5598.3345,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2553.2,308.67,0.0 -256,32768,16384,26624,torch.float8_e4m3fn,ck,154,0,12187.7899,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2345.57,195.47,0.0 -256,32768,26624,16384,torch.float8_e4m3fn,ck,33,0,11524.899,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2480.48,235.83,0.0 -256,32768,51200,5120,torch.float8_e4m3fn,ck,33,0,7575.0119,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2267.97,499.72,0.0 -256,32768,53248,16384,torch.float8_e4m3fn,ck,33,0,22600.3968,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2529.81,216.76,0.0 -256,32768,57344,8192,torch.float8_e4m3fn,ck,158,0,11963.3054,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2573.4,375.84,0.0 -80,192,128,5120,torch.int8,asm,160,8,12.1485,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,20.72,138.91,0.1606 -80,64,1536,5120,torch.int8,asm,160,6,12.9215,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,77.9,649.2,0.1336 -80,80,1536,5120,torch.int8,asm,160,6,13.5695,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,92.73,627.86,0.135 
-80,128,1536,5120,torch.int8,asm,160,6,14.3572,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,140.23,620.8,0.1335 -80,220,1536,5120,torch.int8,asm,160,3,21.1629,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,163.51,456.77,0.0857 -80,256,1536,5120,torch.int8,asm,160,3,21.3733,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,188.39,466.07,0.0853 -80,128,8192,1024,torch.int8,asm,160,1,14.9178,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,143.95,711.69,0.0 -80,256,8192,1024,torch.int8,asm,160,1,27.0157,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,158.98,475.47,0.0 -80,192,1280,1024,torch.int8,asm,160,6,9.0346,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,55.71,221.24,0.0914 -80,192,1536,1024,torch.int8,asm,160,3,10.3979,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,58.09,226.9,0.0725 -80,256,1280,8192,torch.int8,asm,160,4,23.0901,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,232.51,573.33,0.1074 -80,512,1280,8192,torch.int8,asm,160,2,38.6745,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,277.64,413.47,0.0542 -80,1024,1280,8192,torch.int8,asm,160,1,69.7841,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,307.73,308.03,0.0 -80,2048,1280,8192,torch.int8,asm,160,1,132.2413,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,324.78,245.81,0.0 -304,64,1536,5120,torch.int8,asm,0,1,16.7759,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,60.0,500.04,0.0 -304,128,1536,5120,torch.int8,asm,0,1,17.9602,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,112.1,496.26,0.0 -304,256,1536,5120,torch.int8,asm,0,1,17.6791,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,227.76,563.46,0.0 -304,512,1536,5120,torch.int8,asm,1,1,18.7067,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,430.49,644.62,0.0 -304,1024,1536,5120,torch.int8,asm,2,1,21.0125,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,766.5,773.49,0.0 
-304,1664,1536,5120,torch.int8,asm,4,1,27.6261,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,947.38,778.1,0.0 -304,4096,1536,5120,torch.int8,asm,6,1,63.8361,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1009.22,648.83,0.0 -304,8192,1536,5120,torch.int8,asm,6,1,128.8567,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,999.94,581.83,0.0 -304,10240,1536,5120,torch.int8,asm,6,1,142.1441,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1133.08,645.47,0.0 -304,12288,1536,5120,torch.int8,asm,7,1,185.0056,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,1044.69,586.62,0.0 -304,16384,1536,5120,torch.int8,asm,6,1,211.7054,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1217.25,671.13,0.0 -304,20480,1536,5120,torch.int8,asm,6,1,271.7016,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1185.57,646.43,0.0 -304,24576,1536,5120,torch.int8,asm,6,1,328.3654,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1177.19,637.07,0.0 -304,30720,1536,5120,torch.int8,asm,6,1,402.8387,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1199.45,644.23,0.0 -304,32768,1536,5120,torch.int8,asm,6,1,406.391,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1268.23,679.89,0.0 -304,40960,1536,5120,torch.int8,asm,6,1,525.9013,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1225.03,652.99,0.0 -304,64,5120,1280,torch.int8,asm,0,1,7.7424,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,108.35,941.68,0.0 -304,128,5120,1280,torch.int8,asm,1,1,8.5797,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,195.55,935.72,0.0 -304,256,5120,1280,torch.int8,asm,2,1,9.2104,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,364.31,1031.74,0.0 -304,512,5120,1280,torch.int8,asm,4,1,11.2508,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,596.48,1106.75,0.0 -304,1024,5120,1280,torch.int8,asm,6,1,18.2637,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,734.89,1004.73,0.0 
-304,1664,5120,1280,torch.int8,asm,6,1,21.5241,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1013.3,1195.07,0.0 -304,4096,5120,1280,torch.int8,asm,6,1,58.5804,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,916.47,917.36,0.0 -304,8192,5120,1280,torch.int8,asm,6,1,105.657,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1016.25,955.22,0.0 -304,10240,5120,1280,torch.int8,asm,6,1,133.9571,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1001.95,929.54,0.0 -304,12288,5120,1280,torch.int8,asm,6,1,153.8919,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1046.59,962.44,0.0 -304,16384,5120,1280,torch.int8,asm,6,1,195.4489,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1098.74,999.22,0.0 -304,20480,5120,1280,torch.int8,asm,6,1,245.7639,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1092.25,986.65,0.0 -304,24576,5120,1280,torch.int8,asm,6,1,290.5831,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1108.54,996.85,0.0 -304,30720,5120,1280,torch.int8,asm,6,1,361.4235,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1114.08,997.3,0.0 -304,32768,5120,1280,torch.int8,asm,6,1,379.1231,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1132.87,1012.97,0.0 -304,40960,5120,1280,torch.int8,asm,6,1,475.7676,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1128.43,1005.56,0.0 +cu_num,M,N,K,q_dtype_w,libtype,kernelId,splitK,us,kernelName,tflops,bw,errRatio +80,192,128,5120,torch.int8,asm,160,8,12.1485,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,20.72,138.91,0.1606 +80,128,1280,8192,torch.int8,asm,0,1,25.8063,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,104.02,459.66,0.0 +80,192,1280,1024,torch.int8,asm,160,6,9.0346,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,55.71,221.24,0.0914 +80,192,1280,5120,torch.int8,asm,1,1,19.5191,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,128.93,411.3,0.0 
+80,192,1280,8192,torch.int8,asm,1,1,28.2354,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,142.61,444.48,0.0 +80,256,1280,8192,torch.int8,asm,160,4,23.0901,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,232.51,573.33,0.1074 +80,320,1280,8192,torch.int8,asm,2,1,30.0661,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,223.2,463.19,0.0 +80,512,1280,8192,torch.int8,asm,160,2,38.6745,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,277.64,413.47,0.0542 +80,1024,1280,8192,torch.int8,asm,160,1,69.7841,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,307.73,308.03,0.0 +80,2048,1280,8192,torch.int8,asm,160,1,132.2413,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,324.78,245.81,0.0 +80,4096,1280,8192,torch.int8,asm,7,1,232.4914,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,369.47,234.53,0.0 +80,8192,1280,8192,torch.int8,asm,6,1,453.9863,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,378.42,217.11,0.0 +80,16384,1280,8192,torch.int8,asm,6,1,904.0096,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,380.08,206.47,0.0 +80,64,1536,5120,torch.int8,asm,160,6,12.9215,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,77.9,649.2,0.1336 +80,80,1536,5120,torch.int8,asm,160,6,13.5695,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,92.73,627.86,0.135 +80,128,1536,5120,torch.int8,asm,160,6,14.3572,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,140.23,620.8,0.1335 +80,150,1536,5120,torch.int8,asm,1,1,19.8819,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,118.67,457.36,0.0 +80,192,1536,1024,torch.int8,asm,160,3,10.3979,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,58.09,226.9,0.0725 +80,192,1536,5120,torch.float8_e4m3fnuz,cktile,9,0,18.9229,a8w8_bpreshuffle_cktile_0x0x8x4x1x0x0x0x0x1_32x64x512_1x4x1_16x16x64_default,159.59,498.72,0.0 +80,192,1536,5120,torch.int8,asm,1,1,19.8492,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,152.14,475.44,0.0 
+80,220,1536,5120,torch.int8,asm,160,3,21.1629,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,163.51,456.77,0.0857 +80,256,1536,5120,torch.int8,asm,160,3,21.3733,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,188.39,466.07,0.0853 +80,384,1536,5120,torch.int8,asm,3,1,23.2453,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,259.83,473.65,0.0 +80,448,1536,5120,torch.int8,asm,4,1,27.3473,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,257.66,421.77,0.0 +80,512,1536,5120,torch.int8,asm,5,1,31.6855,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_96x128E,254.16,380.57,0.0 +80,128,8192,1024,torch.int8,asm,160,1,14.9178,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,143.95,711.69,0.0 +80,192,8192,1024,torch.int8,asm,6,1,17.6599,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,182.4,664.27,0.0 +80,192,8192,5120,torch.int8,asm,6,1,59.744,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,269.59,771.15,0.0 +80,256,8192,1024,torch.int8,asm,160,1,27.0157,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,158.98,475.47,0.0 +80,320,8192,1024,torch.int8,asm,3,1,32.0456,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_64x128E,167.53,435.6,0.0 +80,512,8192,1024,torch.int8,asm,6,1,33.4552,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,256.76,517.15,0.0 +80,1024,8192,1024,torch.int8,asm,6,1,63.8033,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,269.26,410.86,0.0 +80,2048,8192,1024,torch.int8,asm,6,1,123.7788,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,277.59,355.8,0.0 +80,4096,8192,1024,torch.int8,asm,6,1,232.0475,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,296.14,343.43,0.0 +80,8192,8192,1024,torch.int8,asm,6,1,459.8929,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,298.85,328.33,0.0 +80,16384,8192,1024,torch.int8,asm,6,1,902.3644,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,304.62,325.37,0.0 
+256,64,192,1024,torch.float8_e4m3fn,ck,15,0,4.5229,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,5.56,63.39,0.0 +256,1,800,5120,torch.float8_e4m3fn,ck,10,0,11.3133,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,0.72,362.65,0.0 +256,16,800,5120,torch.float8_e4m3fn,ck,24,0,10.5404,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,12.44,398.8,0.0 +256,32,800,5120,torch.float8_e4m3fn,ck,24,0,10.2279,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,25.63,421.5,0.0 +256,64,800,5120,torch.float8_e4m3fn,ck,10,0,10.5456,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,49.72,429.19,0.0 +256,128,800,5120,torch.float8_e4m3fn,ck,24,0,9.9852,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,105.01,496.35,0.0 +256,256,800,5120,torch.float8_e4m3fn,ck,24,0,11.6817,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,179.52,497.9,0.0 +256,512,800,5120,torch.float8_e4m3fn,ck,10,0,15.082,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,278.1,499.71,0.0 +256,1024,800,5120,torch.float8_e4m3fn,ck,24,0,24.558,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,341.58,446.99,0.0 +256,2048,800,5120,torch.float8_e4m3fn,ck,24,0,41.7422,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,401.92,427.83,0.0 +256,4096,800,5120,torch.float8_e4m3fn,ck,69,0,50.1312,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,669.33,630.77,0.0 +256,8192,800,5120,torch.float8_e4m3fn,ck,69,0,67.6989,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,991.28,873.67,0.0 
+256,16384,800,5120,torch.float8_e4m3fn,ck,69,0,108.6586,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1235.22,1050.97,0.0 +256,32768,800,5120,torch.float8_e4m3fn,ck,69,0,196.771,a8w8_bpreshuffle_256x128x160x128_16x16_16x16_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3,1364.2,1139.89,0.0 +256,1,1280,8192,torch.float8_e4m3fn,ck,10,0,12.9809,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,1.62,808.61,0.0 +256,16,1280,8192,torch.float8_e4m3fn,ck,10,0,12.0223,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,27.91,886.5,0.0 +256,32,1280,8192,torch.float8_e4m3fn,ck,10,0,11.3752,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,59.0,952.06,0.0 +256,64,1280,8192,torch.float8_e4m3fn,ck,10,0,11.9979,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,111.87,931.32,0.0 +256,128,1280,8192,torch.float8_e4m3fn,ck,5,0,12.228,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,219.53,970.07,0.0 +256,256,1280,8192,torch.float8_e4m3fn,ck,12,0,17.1516,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,313.02,771.84,0.0 +256,512,1280,8192,torch.float8_e4m3fn,ck,114,0,21.033,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,510.5,760.27,0.0 +256,1024,1280,8192,torch.float8_e4m3fn,ck,113,0,26.819,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,800.73,801.51,0.0 +256,2048,1280,8192,torch.float8_e4m3fn,ck,123,0,36.7116,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1169.92,885.44,0.0 +256,4096,1280,8192,torch.float8_e4m3fn,ck,137,0,53.9659,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1591.73,1010.38,0.0 
+256,8192,1280,8192,torch.float8_e4m3fn,ck,158,0,83.0796,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2067.88,1186.41,0.0 +256,16384,1280,8192,torch.float8_e4m3fn,ck,158,0,157.7142,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2178.61,1183.45,0.0 +256,32768,1280,8192,torch.float8_e4m3fn,ck,154,0,288.2725,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2383.84,1258.56,0.0 +256,1,2304,16384,torch.float8_e4m3fn,ck,10,0,28.7056,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,2.63,1315.76,0.0 +256,16,2304,16384,torch.float8_e4m3fn,ck,24,0,24.7047,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,48.9,1541.59,0.0 +256,32,2304,16384,torch.float8_e4m3fn,ck,24,0,25.2915,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,95.52,1519.11,0.0 +256,64,2304,16384,torch.float8_e4m3fn,ck,19,0,27.9421,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,172.92,1399.04,0.0 +256,128,2304,16384,torch.float8_e4m3fn,ck,24,0,33.0153,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,292.7,1224.76,0.0 +256,256,2304,16384,torch.float8_e4m3fn,ck,15,0,52.6426,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,367.14,819.16,0.0 +256,512,2304,16384,torch.float8_e4m3fn,ck,115,0,63.7872,a8w8_bpreshuffle_256x80x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,605.99,760.29,0.0 +256,1024,2304,16384,torch.float8_e4m3fn,ck,124,0,81.3585,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,950.23,728.19,0.0 +256,2048,2304,16384,torch.float8_e4m3fn,ck,131,0,101.2648,a8w8_bpreshuffle_256x112x192x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v3,1526.88,797.32,0.0 
+256,4096,2304,16384,torch.float8_e4m3fn,ck,68,0,177.9892,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1737.4,695.17,0.0 +256,8192,2304,16384,torch.float8_e4m3fn,ck,149,0,311.0895,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1988.09,674.13,0.0 +256,16384,2304,16384,torch.float8_e4m3fn,ck,154,0,577.0581,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2143.55,661.43,0.0 +256,32768,2304,16384,torch.float8_e4m3fn,ck,154,0,1073.694,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2304.1,675.81,0.0 +256,1,2560,8192,torch.float8_e4m3fn,ck,10,0,12.8571,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,3.26,1632.16,0.0 +256,16,2560,8192,torch.float8_e4m3fn,ck,10,0,11.95,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,56.16,1772.76,0.0 +256,32,2560,8192,torch.float8_e4m3fn,ck,10,0,12.0316,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,111.55,1778.44,0.0 +256,64,2560,8192,torch.float8_e4m3fn,ck,5,0,12.8533,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,208.85,1697.89,0.0 +256,128,2560,8192,torch.float8_e4m3fn,ck,10,0,17.8034,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,301.56,1273.66,0.0 +256,256,2560,8192,torch.float8_e4m3fn,ck,114,0,21.4545,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,500.47,1136.33,0.0 +256,512,2560,8192,torch.float8_e4m3fn,ck,113,0,27.2155,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,789.07,1021.01,0.0 +256,1024,2560,8192,torch.float8_e4m3fn,ck,123,0,37.4372,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1147.25,924.29,0.0 
+256,2048,2560,8192,torch.float8_e4m3fn,ck,138,0,55.1683,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1557.04,874.32,0.0 +256,4096,2560,8192,torch.float8_e4m3fn,ck,158,0,83.4719,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2058.16,904.47,0.0 +256,8192,2560,8192,torch.float8_e4m3fn,ck,158,0,155.4584,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2210.22,836.39,0.0 +256,16384,2560,8192,torch.float8_e4m3fn,ck,154,0,283.4813,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2424.13,843.35,0.0 +256,32768,2560,8192,torch.float8_e4m3fn,ck,158,0,549.7961,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2499.82,831.54,0.0 +256,1,4608,16384,torch.float8_e4m3fn,ck,10,0,29.8957,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.05,2526.22,0.0 +256,16,4608,16384,torch.float8_e4m3fn,ck,24,0,25.8456,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,93.48,2936.94,0.0 +256,32,4608,16384,torch.float8_e4m3fn,ck,24,0,29.2704,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,165.08,2607.3,0.0 +256,64,4608,16384,torch.float8_e4m3fn,ck,10,0,36.4754,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,264.94,2114.74,0.0 +256,128,4608,16384,torch.float8_e4m3fn,ck,108,0,54.3179,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,355.82,1450.25,0.0 +256,256,4608,16384,torch.float8_e4m3fn,ck,116,0,68.5478,a8w8_bpreshuffle_256x96x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,563.91,1196.99,0.0 
+256,512,4608,16384,torch.float8_e4m3fn,ck,124,0,84.5982,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,913.84,1047.36,0.0 +256,1024,4608,16384,torch.float8_e4m3fn,ck,149,0,104.0599,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1485.86,977.44,0.0 +256,2048,4608,16384,torch.float8_e4m3fn,ck,68,0,176.1003,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1756.03,726.44,0.0 +256,4096,4608,16384,torch.float8_e4m3fn,ck,68,0,307.0339,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2014.36,587.41,0.0 +256,8192,4608,16384,torch.float8_e4m3fn,ck,154,0,561.2735,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2203.83,508.15,0.0 +256,16384,4608,16384,torch.float8_e4m3fn,ck,154,0,1050.0065,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2356.08,471.36,0.0 +256,32768,4608,16384,torch.float8_e4m3fn,ck,154,0,2120.2853,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2333.55,431.24,0.0 +256,1,5120,640,torch.float8_e4m3fn,ck,23,0,5.7466,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,1.14,572.11,0.0 +256,1,5120,1280,torch.float8_e4m3fn,ck,108,0,6.8374,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,1.92,960.18,0.0 +256,1,5120,3200,torch.float8_e4m3fn,ck,23,0,15.188,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,2.16,1079.63,0.0 +256,1,5120,5120,torch.float8_e4m3fn,ck,10,0,11.7201,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,4.47,2238.02,0.0 +256,1,5120,6400,torch.float8_e4m3fn,ck,108,0,19.1846,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,3.42,1708.9,0.0 
+256,1,5120,25600,torch.float8_e4m3fn,ck,10,0,46.1466,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.68,2841.12,0.0 +256,16,5120,640,torch.float8_e4m3fn,ck,23,0,5.7485,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,18.24,600.31,0.0 +256,16,5120,1280,torch.float8_e4m3fn,ck,29,0,6.7718,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,30.97,995.0,0.0 +256,16,5120,3200,torch.float8_e4m3fn,ck,23,0,14.71,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,35.64,1128.42,0.0 +256,16,5120,5120,torch.float8_e4m3fn,ck,24,0,10.554,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,79.48,2507.12,0.0 +256,16,5120,6400,torch.float8_e4m3fn,ck,15,0,19.3187,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,54.28,1709.96,0.0 +256,16,5120,25600,torch.float8_e4m3fn,ck,24,0,39.1761,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,107.06,3360.35,0.0 +256,32,5120,640,torch.float8_e4m3fn,ck,23,0,6.4386,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,32.57,563.0,0.0 +256,32,5120,1280,torch.float8_e4m3fn,ck,29,0,6.9951,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,59.96,989.58,0.0 +256,32,5120,3200,torch.float8_e4m3fn,ck,9,0,15.6619,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,66.95,1073.57,0.0 +256,32,5120,5120,torch.float8_e4m3fn,ck,11,0,11.6951,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,143.46,2283.51,0.0 +256,32,5120,6400,torch.float8_e4m3fn,ck,15,0,19.3178,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,108.56,1723.82,0.0 
+256,32,5120,25600,torch.float8_e4m3fn,ck,24,0,44.1556,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,189.98,2994.39,0.0 +256,64,5120,640,torch.float8_e4m3fn,ck,23,0,6.7207,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,62.41,591.18,0.0 +256,64,5120,1280,torch.float8_e4m3fn,ck,29,0,7.6564,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,109.56,952.26,0.0 +256,64,5120,3200,torch.float8_e4m3fn,ck,23,0,15.6284,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,134.19,1103.39,0.0 +256,64,5120,5120,torch.float8_e4m3fn,ck,10,0,14.3407,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,233.98,1896.52,0.0 +256,64,5120,6400,torch.float8_e4m3fn,ck,15,0,19.7146,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,212.75,1716.14,0.0 +256,64,5120,25600,torch.float8_e4m3fn,ck,24,0,52.8565,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,317.41,2523.17,0.0 +256,128,5120,640,torch.float8_e4m3fn,ck,23,0,6.3513,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,132.08,735.19,0.0 +256,128,5120,1280,torch.float8_e4m3fn,ck,29,0,7.7817,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,215.6,1031.67,0.0 +256,128,5120,3200,torch.float8_e4m3fn,ck,23,0,16.0101,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,261.98,1130.81,0.0 +256,128,5120,5120,torch.float8_e4m3fn,ck,108,0,19.63,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,341.87,1435.58,0.0 +256,128,5120,6400,torch.float8_e4m3fn,ck,29,0,22.9859,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v2,364.95,1518.23,0.0 
+256,128,5120,25600,torch.float8_e4m3fn,ck,114,0,83.5847,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,401.44,1623.02,0.0 +256,256,5120,640,torch.float8_e4m3fn,ck,76,0,7.7921,a8w8_bpreshuffle_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v1,215.31,777.98,0.0 +256,256,5120,1280,torch.float8_e4m3fn,ck,119,0,10.1726,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,329.85,934.15,0.0 +256,256,5120,3200,torch.float8_e4m3fn,ck,77,0,19.1458,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,438.14,1035.46,0.0 +256,256,5120,5120,torch.float8_e4m3fn,ck,116,0,25.9267,a8w8_bpreshuffle_256x96x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,517.68,1162.76,0.0 +256,256,5120,6400,torch.float8_e4m3fn,ck,65,0,28.085,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,597.37,1318.42,0.0 +256,256,5120,25600,torch.float8_e4m3fn,ck,116,0,106.6527,a8w8_bpreshuffle_256x96x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,629.23,1314.99,0.0 +256,512,5120,640,torch.float8_e4m3fn,ck,77,0,9.0563,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,370.51,976.93,0.0 +256,512,5120,1280,torch.float8_e4m3fn,ck,77,0,12.8224,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,523.37,971.1,0.0 +256,512,5120,3200,torch.float8_e4m3fn,ck,85,0,23.0489,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,727.9,1009.39,0.0 +256,512,5120,5120,torch.float8_e4m3fn,ck,124,0,31.9895,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,839.14,1065.31,0.0 +256,512,5120,6400,torch.float8_e4m3fn,ck,139,0,36.8067,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,911.64,1121.74,0.0 
+256,512,5120,25600,torch.float8_e4m3fn,ck,124,0,128.0705,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1048.0,1166.72,0.0 +256,1024,5120,640,torch.float8_e4m3fn,ck,86,0,11.1521,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,601.76,1292.84,0.0 +256,1024,5120,1280,torch.float8_e4m3fn,ck,86,0,16.1465,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,831.25,1136.47,0.0 +256,1024,5120,3200,torch.float8_e4m3fn,ck,51,0,29.1651,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1150.5,1033.65,0.0 +256,1024,5120,5120,torch.float8_e4m3fn,ck,137,0,40.2614,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1333.46,1041.77,0.0 +256,1024,5120,6400,torch.float8_e4m3fn,ck,137,0,48.5222,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1383.05,1026.49,0.0 +256,1024,5120,25600,torch.float8_e4m3fn,ck,137,0,162.5939,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1650.96,1031.85,0.0 +256,2048,5120,640,torch.float8_e4m3fn,ck,102,0,16.8369,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,797.16,1518.04,0.0 +256,2048,5120,1280,torch.float8_e4m3fn,ck,102,0,24.8167,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1081.67,1214.77,0.0 +256,2048,5120,3200,torch.float8_e4m3fn,ck,158,0,41.6304,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1612.02,1054.74,0.0 +256,2048,5120,5120,torch.float8_e4m3fn,ck,102,0,71.8612,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1494.19,802.54,0.0 
+256,2048,5120,6400,torch.float8_e4m3fn,ck,158,0,76.511,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1754.23,873.69,0.0 +256,2048,5120,25600,torch.float8_e4m3fn,ck,137,0,301.0835,a8w8_bpreshuffle_256x96x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1783.13,679.12,0.0 +256,4096,5120,640,torch.float8_e4m3fn,ck,102,0,28.9248,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,928.05,1653.99,0.0 +256,4096,5120,1280,torch.float8_e4m3fn,ck,102,0,42.5949,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1260.41,1261.64,0.0 +256,4096,5120,3200,torch.float8_e4m3fn,ck,47,0,80.2905,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1671.65,889.7,0.0 +256,4096,5120,5120,torch.float8_e4m3fn,ck,138,0,121.2408,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1771.25,735.14,0.0 +256,4096,5120,6400,torch.float8_e4m3fn,ck,158,0,135.91,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1975.1,742.59,0.0 +256,4096,5120,25600,torch.float8_e4m3fn,ck,138,0,505.012,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2126.17,550.23,0.0 +256,8192,5120,640,torch.float8_e4m3fn,ck,143,0,50.4892,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1063.34,1830.21,0.0 +256,8192,5120,1280,torch.float8_e4m3fn,ck,143,0,77.0321,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1393.89,1310.17,0.0 +256,8192,5120,3200,torch.float8_e4m3fn,ck,40,0,139.3777,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1925.96,907.49,0.0 
+256,8192,5120,5120,torch.float8_e4m3fn,ck,154,0,215.948,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1988.89,704.07,0.0 +256,8192,5120,6400,torch.float8_e4m3fn,ck,40,0,242.9206,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2210.07,696.04,0.0 +256,8192,5120,25600,torch.float8_e4m3fn,ck,154,0,938.5635,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2288.05,452.47,0.0 +256,16384,5120,640,torch.float8_e4m3fn,ck,143,0,91.8209,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1169.39,1977.05,0.0 +256,16384,5120,1280,torch.float8_e4m3fn,ck,143,0,139.3615,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1540.94,1401.37,0.0 +256,16384,5120,3200,torch.float8_e4m3fn,ck,33,0,250.8978,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2139.8,942.95,0.0 +256,16384,5120,5120,torch.float8_e4m3fn,ck,33,0,403.9327,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2126.58,687.92,0.0 +256,16384,5120,6400,torch.float8_e4m3fn,ck,107,0,443.5168,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2420.97,688.58,0.0 +256,16384,5120,25600,torch.float8_e4m3fn,ck,154,0,1817.1823,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2363.53,395.27,0.0 +256,32768,5120,640,torch.float8_e4m3fn,ck,143,0,181.1291,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1185.61,1986.39,0.0 +256,32768,5120,1280,torch.float8_e4m3fn,ck,102,0,269.995,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1590.76,1422.4,0.0 
+256,32768,5120,3200,torch.float8_e4m3fn,ck,33,0,492.1153,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2181.89,928.21,0.0 +256,32768,5120,5120,torch.float8_e4m3fn,ck,33,0,764.8154,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2246.28,692.36,0.0 +256,32768,5120,6400,torch.float8_e4m3fn,ck,33,0,868.9521,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2471.35,665.2,0.0 +256,32768,5120,25600,torch.float8_e4m3fn,ck,154,0,3714.3301,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2312.65,351.47,0.0 +256,1,6400,5120,torch.float8_e4m3fn,ck,24,0,12.5759,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,5.21,2607.04,0.0 +256,16,6400,5120,torch.float8_e4m3fn,ck,10,0,11.3305,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,92.54,2917.32,0.0 +256,32,6400,5120,torch.float8_e4m3fn,ck,5,0,12.3982,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,169.15,2689.22,0.0 +256,64,6400,5120,torch.float8_e4m3fn,ck,12,0,14.7961,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,283.47,2292.15,0.0 +256,128,6400,5120,torch.float8_e4m3fn,ck,114,0,20.5908,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,407.4,1702.79,0.0 +256,256,6400,5120,torch.float8_e4m3fn,ck,65,0,26.9347,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,622.88,1386.89,0.0 +256,512,6400,5120,torch.float8_e4m3fn,ck,124,0,33.3791,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1005.25,1256.57,0.0 +256,1024,6400,5120,torch.float8_e4m3fn,ck,138,0,45.101,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1487.97,1133.41,0.0 
+256,2048,6400,5120,torch.float8_e4m3fn,ck,138,0,84.6213,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1586.1,820.93,0.0 +256,4096,6400,5120,torch.float8_e4m3fn,ck,40,0,148.8986,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1802.81,713.02,0.0 +256,8192,6400,5120,torch.float8_e4m3fn,ck,40,0,270.3545,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1985.8,664.2,0.0 +256,16384,6400,5120,torch.float8_e4m3fn,ck,154,0,516.4428,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2079.11,631.96,0.0 +256,32768,6400,5120,torch.float8_e4m3fn,ck,33,0,980.8335,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2189.45,632.09,0.0 +256,8,6656,16384,torch.float8_e4m3fn,ck,10,0,31.2392,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,55.85,3498.47,0.0 +256,1,7168,8192,torch.float8_e4m3fn,ck,24,0,15.0501,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,7.8,3903.15,0.0 +256,16,7168,8192,torch.float8_e4m3fn,ck,10,0,13.6022,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,138.14,4343.47,0.0 +256,32,7168,8192,torch.float8_e4m3fn,ck,11,0,15.5322,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,241.96,3826.96,0.0 +256,64,7168,8192,torch.float8_e4m3fn,ck,12,0,20.6714,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,363.6,2910.4,0.0 +256,128,7168,8192,torch.float8_e4m3fn,ck,114,0,23.7352,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,633.34,2595.46,0.0 +256,256,7168,8192,torch.float8_e4m3fn,ck,65,0,29.6374,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1014.42,2175.88,0.0 
+256,512,7168,8192,torch.float8_e4m3fn,ck,63,0,42.0669,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1429.38,1670.07,0.0 +256,1024,7168,8192,torch.float8_e4m3fn,ck,154,0,62.2696,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1931.26,1313.46,0.0 +256,2048,7168,8192,torch.float8_e4m3fn,ck,33,0,112.617,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2135.72,931.1,0.0 +256,4096,7168,8192,torch.float8_e4m3fn,ck,156,0,206.7372,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2326.8,730.37,0.0 +256,8192,7168,8192,torch.float8_e4m3fn,ck,158,0,385.4824,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2495.76,631.08,0.0 +256,16384,7168,8192,torch.float8_e4m3fn,ck,154,0,761.2665,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2527.56,561.98,0.0 +256,32768,7168,8192,torch.float8_e4m3fn,ck,158,0,1484.7467,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2591.88,536.74,0.0 +256,1,8192,1024,torch.float8_e4m3fn,ck,108,0,5.9704,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,2.81,1407.95,0.0 +256,1,8192,2048,torch.float8_e4m3fn,ck,10,0,5.8009,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,5.78,2895.35,0.0 +256,1,8192,3584,torch.float8_e4m3fn,ck,10,0,7.8125,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,7.52,3760.65,0.0 +256,1,8192,7168,torch.float8_e4m3fn,ck,24,0,13.9797,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,8.4,4202.08,0.0 +256,1,8192,8192,torch.float8_e4m3fn,ck,10,0,14.8978,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.01,4506.27,0.0 
+256,1,8192,28672,torch.float8_e4m3fn,ck,10,0,46.0411,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.2,5102.53,0.0 +256,16,8192,1024,torch.float8_e4m3fn,ck,15,0,5.9167,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,45.37,1464.86,0.0 +256,16,8192,2048,torch.float8_e4m3fn,ck,10,0,5.6214,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,95.5,3036.99,0.0 +256,16,8192,3584,torch.float8_e4m3fn,ck,10,0,7.8314,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,119.97,3789.82,0.0 +256,16,8192,7168,torch.float8_e4m3fn,ck,10,0,12.5267,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,150.0,4717.69,0.0 +256,16,8192,8192,torch.float8_e4m3fn,ck,10,0,13.9737,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,153.68,4830.65,0.0 +256,16,8192,28672,torch.float8_e4m3fn,ck,10,0,41.1742,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,182.55,5722.08,0.0 +256,32,8192,1024,torch.float8_e4m3fn,ck,108,0,6.0451,a8w8_bpreshuffle_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,88.81,1479.82,0.0 +256,32,8192,2048,torch.float8_e4m3fn,ck,5,0,6.2663,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,171.35,2771.5,0.0 +256,32,8192,3584,torch.float8_e4m3fn,ck,5,0,7.9538,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,236.25,3771.67,0.0 +256,32,8192,7168,torch.float8_e4m3fn,ck,11,0,14.5895,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,257.59,4076.49,0.0 +256,32,8192,8192,torch.float8_e4m3fn,ck,5,0,15.7818,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,272.15,4302.13,0.0 
+256,32,8192,28672,torch.float8_e4m3fn,ck,5,0,47.2548,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,318.11,5001.03,0.0 +256,64,8192,2048,torch.float8_e4m3fn,ck,5,0,7.5112,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,285.9,2390.68,0.0 +256,64,8192,3584,torch.float8_e4m3fn,ck,12,0,11.0813,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,339.14,2764.85,0.0 +256,64,8192,7168,torch.float8_e4m3fn,ck,12,0,19.2039,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,391.39,3136.22,0.0 +256,64,8192,8192,torch.float8_e4m3fn,ck,12,0,21.1081,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,406.95,3253.81,0.0 +256,64,8192,28672,torch.float8_e4m3fn,ck,12,0,64.9256,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,463.06,3662.11,0.0 +256,128,8192,1024,torch.float8_e4m3fn,ck,119,0,6.9407,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,309.4,1529.65,0.0 +256,128,8192,2048,torch.float8_e4m3fn,ck,114,0,8.8614,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,484.68,2159.54,0.0 +256,128,8192,3584,torch.float8_e4m3fn,ck,114,0,12.978,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,579.15,2459.24,0.0 +256,128,8192,7168,torch.float8_e4m3fn,ck,114,0,21.6976,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,692.81,2845.24,0.0 +256,128,8192,8192,torch.float8_e4m3fn,ck,114,0,24.8294,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,691.92,2829.49,0.0 +256,128,8192,28672,torch.float8_e4m3fn,ck,114,0,75.1872,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,799.73,3200.65,0.0 
+256,256,8192,1024,torch.float8_e4m3fn,ck,77,0,9.7085,a8w8_bpreshuffle_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,442.39,1323.07,0.0 +256,256,8192,2048,torch.float8_e4m3fn,ck,114,0,11.9681,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,717.74,1796.09,0.0 +256,256,8192,3584,torch.float8_e4m3fn,ck,144,0,16.6491,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,902.89,2070.5,0.0 +256,256,8192,7168,torch.float8_e4m3fn,ck,65,0,28.6205,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1050.46,2262.35,0.0 +256,256,8192,8192,torch.float8_e4m3fn,ck,144,0,30.9086,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1111.66,2374.75,0.0 +256,256,8192,28672,torch.float8_e4m3fn,ck,65,0,95.3629,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1261.07,2583.98,0.0 +256,512,8192,1024,torch.float8_e4m3fn,ck,85,0,11.5306,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,744.97,1500.49,0.0 +256,512,8192,2048,torch.float8_e4m3fn,ck,139,0,15.6554,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1097.38,1674.46,0.0 +256,512,8192,3584,torch.float8_e4m3fn,ck,139,0,22.7592,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1320.99,1739.24,0.0 +256,512,8192,7168,torch.float8_e4m3fn,ck,139,0,39.3552,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1527.87,1798.46,0.0 +256,512,8192,8192,torch.float8_e4m3fn,ck,139,0,43.6902,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1572.88,1824.02,0.0 
+256,512,8192,28672,torch.float8_e4m3fn,ck,139,0,133.9721,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1795.29,1925.4,0.0 +256,1024,8192,1024,torch.float8_e4m3fn,ck,0,0,16.8199,a8w8_bpreshuffle_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1021.4,1558.53,0.0 +256,1024,8192,2048,torch.float8_e4m3fn,ck,154,0,23.1654,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1483.24,1539.0,0.0 +256,1024,8192,3584,torch.float8_e4m3fn,ck,154,0,34.8454,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1725.61,1429.38,0.0 +256,1024,8192,7168,torch.float8_e4m3fn,ck,154,0,59.0054,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2038.1,1403.9,0.0 +256,1024,8192,8192,torch.float8_e4m3fn,ck,154,0,64.2479,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2139.2,1436.23,0.0 +256,1024,8192,28672,torch.float8_e4m3fn,ck,154,0,197.2769,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2438.38,1424.49,0.0 +256,2048,8192,1024,torch.float8_e4m3fn,ck,86,0,28.9708,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1186.01,1520.16,0.0 +256,2048,8192,2048,torch.float8_e4m3fn,ck,33,0,42.2356,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1627.05,1291.0,0.0 +256,2048,8192,3584,torch.float8_e4m3fn,ck,33,0,61.9146,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1942.34,1134.7,0.0 +256,2048,8192,7168,torch.float8_e4m3fn,ck,33,0,105.8446,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2272.37,1010.49,0.0 
+256,2048,8192,8192,torch.float8_e4m3fn,ck,33,0,117.6729,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2335.95,998.03,0.0 +256,2048,8192,28672,torch.float8_e4m3fn,ck,33,0,357.8032,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2688.83,914.35,0.0 +256,4096,8192,1024,torch.float8_e4m3fn,ck,86,0,49.3525,a8w8_bpreshuffle_256x96x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1392.42,1614.75,0.0 +256,4096,8192,2048,torch.float8_e4m3fn,ck,143,0,80.1603,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1714.55,1151.13,0.0 +256,4096,8192,3584,torch.float8_e4m3fn,ck,33,0,117.1442,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2053.18,948.82,0.0 +256,4096,8192,7168,torch.float8_e4m3fn,ck,33,0,196.6543,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2446.1,789.15,0.0 +256,4096,8192,8192,torch.float8_e4m3fn,ck,33,0,221.6286,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2480.53,757.0,0.0 +256,4096,8192,28672,torch.float8_e4m3fn,ck,33,0,686.9896,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2800.84,610.53,0.0 +256,8192,8192,1024,torch.float8_e4m3fn,ck,51,0,92.436,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1486.86,1633.51,0.0 +256,8192,8192,2048,torch.float8_e4m3fn,ck,51,0,141.6683,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1940.29,1184.26,0.0 +256,8192,8192,3584,torch.float8_e4m3fn,ck,143,0,222.4514,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2162.43,867.33,0.0 
+256,8192,8192,7168,torch.float8_e4m3fn,ck,33,0,383.1227,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2511.13,656.86,0.0 +256,8192,8192,8192,torch.float8_e4m3fn,ck,33,0,424.8317,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2588.11,631.86,0.0 +256,8192,8192,28672,torch.float8_e4m3fn,ck,33,0,1389.0293,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2770.49,434.82,0.0 +256,16384,8192,1024,torch.float8_e4m3fn,ck,143,0,172.4777,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1593.7,1702.26,0.0 +256,16384,8192,2048,torch.float8_e4m3fn,ck,51,0,272.4657,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2017.71,1169.93,0.0 +256,16384,8192,3584,torch.float8_e4m3fn,ck,33,0,421.1579,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2284.35,846.51,0.0 +256,16384,8192,7168,torch.float8_e4m3fn,ck,158,0,753.6754,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2553.02,589.9,0.0 +256,16384,8192,8192,torch.float8_e4m3fn,ck,158,0,852.0249,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2580.94,551.35,0.0 +256,16384,8192,28672,torch.float8_e4m3fn,ck,33,0,2785.3542,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2763.23,349.36,0.0 +256,32768,8192,1024,torch.float8_e4m3fn,ck,143,0,340.0384,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1616.75,1702.2,0.0 +256,32768,8192,2048,torch.float8_e4m3fn,ck,143,0,536.6149,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2048.98,1156.8,0.0 
+256,32768,8192,3584,torch.float8_e4m3fn,ck,51,0,841.2106,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2287.35,812.72,0.0 +256,32768,8192,7168,torch.float8_e4m3fn,ck,158,0,1500.7026,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2564.33,553.39,0.0 +256,32768,8192,8192,torch.float8_e4m3fn,ck,158,0,1696.016,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2593.16,514.39,0.0 +256,32768,8192,28672,torch.float8_e4m3fn,ck,33,0,5558.3544,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2769.37,307.87,0.0 +256,1,9216,16384,torch.float8_e4m3fn,ck,10,0,35.2763,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.56,4281.34,0.0 +256,16,9216,16384,torch.float8_e4m3fn,ck,24,0,32.3786,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,149.23,4680.62,0.0 +256,32,9216,16384,torch.float8_e4m3fn,ck,24,0,39.3729,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,245.44,3863.29,0.0 +256,64,9216,16384,torch.float8_e4m3fn,ck,114,0,56.6696,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,341.05,2703.8,0.0 +256,128,9216,16384,torch.float8_e4m3fn,ck,126,0,72.8197,a8w8_bpreshuffle_256x32x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v1,530.83,2134.74,0.0 +256,256,9216,16384,torch.float8_e4m3fn,ck,124,0,84.3085,a8w8_bpreshuffle_256x112x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,916.98,1896.7,0.0 +256,512,9216,16384,torch.float8_e4m3fn,ck,149,0,104.8693,a8w8_bpreshuffle_256x128x192x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1474.4,1609.82,0.0 
+256,1024,9216,16384,torch.float8_e4m3fn,ck,54,0,178.1558,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1735.77,1047.66,0.0 +256,2048,9216,16384,torch.float8_e4m3fn,ck,68,0,305.2233,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2026.3,728.31,0.0 +256,4096,9216,16384,torch.float8_e4m3fn,ck,95,0,554.8485,a8w8_bpreshuffle_256x128x192x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2229.35,529.16,0.0 +256,8192,9216,16384,torch.float8_e4m3fn,ck,154,0,1095.3193,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2258.61,398.25,0.0 +256,16384,9216,16384,torch.float8_e4m3fn,ck,33,0,2047.2744,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2416.78,352.38,0.0 +256,32768,9216,16384,torch.float8_e4m3fn,ck,33,0,4103.0357,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2411.78,314.85,0.0 +256,1,10240,8192,torch.float8_e4m3fn,ck,10,0,16.9524,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,9.9,4950.02,0.0 +256,16,10240,8192,torch.float8_e4m3fn,ck,10,0,17.892,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,150.03,4714.11,0.0 +256,32,10240,8192,torch.float8_e4m3fn,ck,10,0,22.5861,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,237.7,3754.68,0.0 +256,64,10240,8192,torch.float8_e4m3fn,ck,114,0,25.5687,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,419.94,3352.58,0.0 +256,128,10240,8192,torch.float8_e4m3fn,ck,113,0,30.4124,a8w8_bpreshuffle_256x48x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4x4x1_1x1_intrawave_v1,706.12,2878.96,0.0 
+256,256,10240,8192,torch.float8_e4m3fn,ck,123,0,38.8017,a8w8_bpreshuffle_256x96x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1106.9,2351.09,0.0 +256,512,10240,8192,torch.float8_e4m3fn,ck,143,0,55.4465,a8w8_bpreshuffle_256x192x128x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1549.23,1777.68,0.0 +256,1024,10240,8192,torch.float8_e4m3fn,ck,158,0,83.5863,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2055.35,1354.84,0.0 +256,2048,10240,8192,torch.float8_e4m3fn,ck,158,0,156.1052,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2201.06,913.53,0.0 +256,4096,10240,8192,torch.float8_e4m3fn,ck,106,0,283.3828,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2424.97,710.44,0.0 +256,8192,10240,8192,torch.float8_e4m3fn,ck,33,0,533.4106,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2576.61,597.6,0.0 +256,16384,10240,8192,torch.float8_e4m3fn,ck,33,0,1068.7185,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2572.03,518.05,0.0 +256,32768,10240,8192,torch.float8_e4m3fn,ck,158,0,2102.9024,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2614.27,486.67,0.0 +256,1,12800,5120,torch.float8_e4m3fn,ck,10,0,14.7141,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,8.91,4456.05,0.0 +256,16,12800,5120,torch.float8_e4m3fn,ck,5,0,14.0716,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,149.03,4692.25,0.0 +256,32,12800,5120,torch.float8_e4m3fn,ck,12,0,17.5533,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,238.95,3789.55,0.0 
+256,64,12800,5120,torch.float8_e4m3fn,ck,119,0,21.9262,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,382.58,3078.6,0.0 +256,128,12800,5120,torch.float8_e4m3fn,ck,65,0,28.9421,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,579.68,2400.25,0.0 +256,256,12800,5120,torch.float8_e4m3fn,ck,139,0,34.8457,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,962.94,2106.44,0.0 +256,512,12800,5120,torch.float8_e4m3fn,ck,138,0,46.2157,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1452.08,1758.38,0.0 +256,1024,12800,5120,torch.float8_e4m3fn,ck,138,0,86.9605,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1543.43,1115.37,0.0 +256,2048,12800,5120,torch.float8_e4m3fn,ck,40,0,154.2962,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1739.74,832.49,0.0 +256,4096,12800,5120,torch.float8_e4m3fn,ck,40,0,270.4291,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1985.26,707.64,0.0 +256,8192,12800,5120,torch.float8_e4m3fn,ck,158,0,519.2638,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2067.82,610.85,0.0 +256,16384,12800,5120,torch.float8_e4m3fn,ck,33,0,974.5645,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2203.53,583.7,0.0 +256,32768,12800,5120,torch.float8_e4m3fn,ck,33,0,1888.3681,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2274.43,567.78,0.0 +256,1,13312,16384,torch.float8_e4m3fn,ck,10,0,40.7795,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.7,5349.42,0.0 
+256,16,13312,16384,torch.float8_e4m3fn,ck,10,0,40.3808,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,172.84,5418.22,0.0 +256,32,13312,16384,torch.float8_e4m3fn,ck,12,0,46.3062,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,301.44,4739.76,0.0 +256,64,13312,16384,torch.float8_e4m3fn,ck,114,0,60.16,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,464.05,3671.15,0.0 +256,128,13312,16384,torch.float8_e4m3fn,ck,144,0,79.9061,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,698.75,2798.4,0.0 +256,256,13312,16384,torch.float8_e4m3fn,ck,139,0,92.6483,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1205.3,2472.94,0.0 +256,512,13312,16384,torch.float8_e4m3fn,ck,154,0,127.8746,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1746.54,1877.81,0.0 +256,1024,13312,16384,torch.float8_e4m3fn,ck,154,0,243.5736,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1833.85,1076.24,0.0 +256,2048,13312,16384,torch.float8_e4m3fn,ck,138,0,457.646,a8w8_bpreshuffle_256x112x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1952.06,669.04,0.0 +256,4096,13312,16384,torch.float8_e4m3fn,ck,154,0,842.3709,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2121.04,468.04,0.0 +256,8192,13312,16384,torch.float8_e4m3fn,ck,33,0,1546.1027,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2311.24,368.94,0.0 +256,16384,13312,16384,torch.float8_e4m3fn,ck,33,0,2958.4415,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2415.74,311.9,0.0 
+256,32768,13312,16384,torch.float8_e4m3fn,ck,33,0,6036.8682,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2367.73,269.58,0.0 +256,1,14336,8192,torch.float8_e4m3fn,ck,10,0,21.766,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.79,5397.29,0.0 +256,16,14336,8192,torch.float8_e4m3fn,ck,10,0,21.2685,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,176.7,5549.54,0.0 +256,32,14336,8192,torch.float8_e4m3fn,ck,6,0,26.4468,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,284.2,4485.24,0.0 +256,64,14336,8192,torch.float8_e4m3fn,ck,114,0,29.5848,a8w8_bpreshuffle_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,508.11,4049.37,0.0 +256,128,14336,8192,torch.float8_e4m3fn,ck,144,0,36.1842,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,830.88,3376.03,0.0 +256,256,14336,8192,torch.float8_e4m3fn,ck,139,0,45.4901,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1321.82,2789.13,0.0 +256,512,14336,8192,torch.float8_e4m3fn,ck,154,0,63.6195,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1890.29,2142.66,0.0 +256,1024,14336,8192,torch.float8_e4m3fn,ck,33,0,112.839,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2131.52,1375.32,0.0 +256,2048,14336,8192,torch.float8_e4m3fn,ck,54,0,205.0365,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2346.1,940.99,0.0 +256,4096,14336,8192,torch.float8_e4m3fn,ck,158,0,387.5867,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2482.21,692.58,0.0 
+256,8192,14336,8192,torch.float8_e4m3fn,ck,33,0,734.9548,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2618.05,570.69,0.0 +256,16384,14336,8192,torch.float8_e4m3fn,ck,158,0,1477.5648,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2604.48,488.25,0.0 +256,32768,14336,8192,torch.float8_e4m3fn,ck,158,0,2945.0068,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2613.43,450.05,0.0 +256,1,16384,2048,torch.float8_e4m3fn,ck,24,0,9.9256,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,6.76,3384.1,0.0 +256,1,16384,4096,torch.float8_e4m3fn,ck,24,0,14.6481,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,9.16,4583.92,0.0 +256,1,16384,6656,torch.float8_e4m3fn,ck,24,0,21.1312,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,10.32,5162.57,0.0 +256,1,16384,8192,torch.float8_e4m3fn,ck,10,0,25.6037,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,10.48,5243.72,0.0 +256,1,16384,13312,torch.float8_e4m3fn,ck,11,0,39.322,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,11.09,5547.78,0.0 +256,1,16384,26624,torch.float8_e4m3fn,ck,5,0,73.9656,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,11.79,5898.24,0.0 +256,4,16384,6656,torch.float8_e4m3fn,ck,11,0,20.7235,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,42.1,5269.84,0.0 +256,8,16384,6656,torch.float8_e4m3fn,ck,11,0,20.8171,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,83.82,5253.72,0.0 +256,16,16384,2048,torch.float8_e4m3fn,ck,10,0,7.7604,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,138.36,4395.58,0.0 
+256,16,16384,4096,torch.float8_e4m3fn,ck,26,0,17.4151,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2,123.31,3887.36,0.0 +256,16,16384,6656,torch.float8_e4m3fn,ck,24,0,20.8707,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2,167.2,5255.34,0.0 +256,16,16384,8192,torch.float8_e4m3fn,ck,10,0,25.0145,a8w8_bpreshuffle_128x16x32x512_16x16_16x16_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v1,171.7,5391.8,0.0 +256,16,16384,13312,torch.float8_e4m3fn,ck,5,0,37.8531,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,184.38,5781.33,0.0 +256,16,16384,26624,torch.float8_e4m3fn,ck,11,0,72.6906,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v1,192.03,6013.95,0.0 +256,32,16384,2048,torch.float8_e4m3fn,ck,6,0,9.6867,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,221.69,3578.98,0.0 +256,32,16384,4096,torch.float8_e4m3fn,ck,26,0,17.6799,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v2,242.93,3862.49,0.0 +256,32,16384,6656,torch.float8_e4m3fn,ck,12,0,23.1146,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,301.94,4772.46,0.0 +256,32,16384,8192,torch.float8_e4m3fn,ck,19,0,28.8166,a8w8_bpreshuffle_256x16x64x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2,298.09,4703.14,0.0 +256,32,16384,13312,torch.float8_e4m3fn,ck,12,0,40.7803,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,342.29,5384.42,0.0 +256,32,16384,26624,torch.float8_e4m3fn,ck,12,0,77.4146,a8w8_bpreshuffle_256x32x64x512_16x16_16x16_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v1,360.62,5659.24,0.0 +256,64,16384,2048,torch.float8_e4m3fn,ck,119,0,11.4341,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,375.63,3129.47,0.0 
+256,64,16384,4096,torch.float8_e4m3fn,ck,119,0,20.8735,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,411.52,3328.06,0.0 +256,64,16384,6656,torch.float8_e4m3fn,ck,119,0,29.3009,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,476.39,3807.9,0.0 +256,64,16384,8192,torch.float8_e4m3fn,ck,119,0,35.6664,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,481.68,3836.64,0.0 +256,64,16384,13312,torch.float8_e4m3fn,ck,119,0,53.1879,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,524.88,4156.08,0.0 +256,64,16384,26624,torch.float8_e4m3fn,ck,119,0,102.4795,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,544.84,4293.63,0.0 +256,128,16384,2048,torch.float8_e4m3fn,ck,119,0,15.5043,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,554.04,2451.63,0.0 +256,128,16384,4096,torch.float8_e4m3fn,ck,121,0,26.5935,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,646.02,2700.94,0.0 +256,128,16384,6656,torch.float8_e4m3fn,ck,65,0,39.0387,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,715.12,2922.69,0.0 +256,128,16384,8192,torch.float8_e4m3fn,ck,121,0,45.9107,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,748.4,3037.65,0.0 +256,128,16384,13312,torch.float8_e4m3fn,ck,65,0,70.6454,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,790.35,3170.79,0.0 +256,128,16384,26624,torch.float8_e4m3fn,ck,144,0,134.0597,a8w8_bpreshuffle_256x128x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,832.98,3310.54,0.0 
+256,256,16384,2048,torch.float8_e4m3fn,ck,139,0,18.9985,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,904.28,2235.3,0.0 +256,256,16384,4096,torch.float8_e4m3fn,ck,63,0,32.6482,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1052.42,2344.57,0.0 +256,256,16384,6656,torch.float8_e4m3fn,ck,139,0,46.695,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1195.73,2551.55,0.0 +256,256,16384,8192,torch.float8_e4m3fn,ck,139,0,55.672,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1234.36,2599.21,0.0 +256,256,16384,13312,torch.float8_e4m3fn,ck,139,0,83.1524,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1342.95,2764.81,0.0 +256,256,16384,26624,torch.float8_e4m3fn,ck,139,0,163.3311,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1367.4,2763.78,0.0 +256,512,16384,2048,torch.float8_e4m3fn,ck,154,0,27.0054,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1272.33,1902.59,0.0 +256,512,16384,4096,torch.float8_e4m3fn,ck,154,0,43.8782,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1566.14,1959.59,0.0 +256,512,16384,6656,torch.float8_e4m3fn,ck,154,0,64.427,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1733.27,2005.94,0.0 +256,512,16384,8192,torch.float8_e4m3fn,ck,154,0,73.6131,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1867.04,2108.17,0.0 +256,512,16384,13312,torch.float8_e4m3fn,ck,154,0,114.1614,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1956.34,2117.15,0.0 
+256,512,16384,26624,torch.float8_e4m3fn,ck,154,0,214.713,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2080.34,2173.21,0.0 +256,1024,16384,2048,torch.float8_e4m3fn,ck,33,0,49.7279,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1381.91,1391.69,0.0 +256,1024,16384,4096,torch.float8_e4m3fn,ck,33,0,81.4525,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1687.35,1287.35,0.0 +256,1024,16384,6656,torch.float8_e4m3fn,ck,2,0,120.523,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3,1853.08,1239.78,0.0 +256,1024,16384,8192,torch.float8_e4m3fn,ck,154,0,136.0178,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2020.9,1295.13,0.0 +256,1024,16384,13312,torch.float8_e4m3fn,ck,154,0,214.8294,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2079.22,1234.89,0.0 +256,1024,16384,26624,torch.float8_e4m3fn,ck,154,0,412.5087,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2165.66,1204.88,0.0 +256,2048,16384,2048,torch.float8_e4m3fn,ck,33,0,89.7238,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1531.8,1168.67,0.0 +256,2048,16384,4096,torch.float8_e4m3fn,ck,33,0,146.1468,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1880.83,975.77,0.0 +256,2048,16384,6656,torch.float8_e4m3fn,ck,33,0,216.3109,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2064.97,877.4,0.0 +256,2048,16384,8192,torch.float8_e4m3fn,ck,33,0,253.8314,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2165.83,859.25,0.0 
+256,2048,16384,13312,torch.float8_e4m3fn,ck,33,0,388.9416,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2296.88,803.4,0.0 +256,2048,16384,26624,torch.float8_e4m3fn,ck,154,0,777.8647,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2296.94,717.15,0.0 +256,4096,16384,2048,torch.float8_e4m3fn,ck,102,0,160.9167,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1708.2,1094.73,0.0 +256,4096,16384,4096,torch.float8_e4m3fn,ck,33,0,255.2978,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2153.39,854.31,0.0 +256,4096,16384,6656,torch.float8_e4m3fn,ck,33,0,407.5508,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2192.0,663.8,0.0 +256,4096,16384,8192,torch.float8_e4m3fn,ck,33,0,471.5371,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2331.76,640.44,0.0 +256,4096,16384,13312,torch.float8_e4m3fn,ck,33,0,740.5043,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2412.82,549.42,0.0 +256,4096,16384,26624,torch.float8_e4m3fn,ck,33,0,1511.0293,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2364.89,449.68,0.0 +256,8192,16384,2048,torch.float8_e4m3fn,ck,102,0,303.9224,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1808.87,1048.84,0.0 +256,8192,16384,4096,torch.float8_e4m3fn,ck,33,0,497.9337,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2208.15,741.26,0.0 +256,8192,16384,6656,torch.float8_e4m3fn,ck,33,0,770.3086,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2319.47,560.83,0.0 
+256,8192,16384,8192,torch.float8_e4m3fn,ck,33,0,903.531,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2433.81,519.92,0.0 +256,8192,16384,13312,torch.float8_e4m3fn,ck,107,0,1430.2158,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_4x4x1_1x1_intrawave_v3,2498.51,416.43,0.0 +256,8192,16384,26624,torch.float8_e4m3fn,ck,154,0,3033.7821,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2355.75,304.16,0.0 +256,16384,16384,2048,torch.float8_e4m3fn,ck,102,0,598.3663,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1837.52,1009.38,0.0 +256,16384,16384,4096,torch.float8_e4m3fn,ck,33,0,988.8674,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2223.78,678.64,0.0 +256,16384,16384,6656,torch.float8_e4m3fn,ck,33,0,1537.3785,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2324.35,491.08,0.0 +256,16384,16384,8192,torch.float8_e4m3fn,ck,33,0,1794.5054,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2450.84,448.76,0.0 +256,16384,16384,13312,torch.float8_e4m3fn,ck,33,0,2976.2278,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2401.3,326.95,0.0 +256,16384,16384,26624,torch.float8_e4m3fn,ck,154,0,6167.0855,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2317.73,228.52,0.0 +256,32768,16384,2048,torch.float8_e4m3fn,ck,102,0,1180.9193,a8w8_bpreshuffle_256x96x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1862.13,994.48,0.0 +256,32768,16384,4096,torch.float8_e4m3fn,ck,33,0,1989.9333,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2210.15,640.76,0.0 
+256,32768,16384,6656,torch.float8_e4m3fn,ck,33,0,3099.1632,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2306.05,452.02,0.0 +256,32768,16384,8192,torch.float8_e4m3fn,ck,33,0,3588.7344,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2451.03,411.4,0.0 +256,32768,16384,13312,torch.float8_e4m3fn,ck,33,0,5598.3345,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2553.2,308.67,0.0 +256,32768,16384,26624,torch.float8_e4m3fn,ck,154,0,12187.7899,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2345.57,195.47,0.0 +256,1,26624,16384,torch.float8_e4m3fn,ck,6,0,76.8576,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.35,5676.44,0.0 +256,16,26624,16384,torch.float8_e4m3fn,ck,6,0,77.3342,a8w8_bpreshuffle_256x16x128x512_16x16_16x16_32x8x1_32x8x1_1x16x1x16_8x8x1_1x2_intrawave_v1,180.5,5654.96,0.0 +256,32,26624,16384,torch.float8_e4m3fn,ck,119,0,80.3452,a8w8_bpreshuffle_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,347.47,5456.9,0.0 +256,64,26624,16384,torch.float8_e4m3fn,ck,121,0,91.6538,a8w8_bpreshuffle_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,609.19,4807.92,0.0 +256,128,26624,16384,torch.float8_e4m3fn,ck,139,0,108.1175,a8w8_bpreshuffle_256x128x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1032.85,4117.01,0.0 +256,256,26624,16384,torch.float8_e4m3fn,ck,154,0,141.6113,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1577.12,3206.19,0.0 +256,512,26624,16384,torch.float8_e4m3fn,ck,154,0,250.2793,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1784.71,1885.33,0.0 
+256,1024,26624,16384,torch.float8_e4m3fn,ck,33,0,467.1211,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1912.47,1086.47,0.0 +256,2048,26624,16384,torch.float8_e4m3fn,ck,154,0,848.1746,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2106.53,682.42,0.0 +256,4096,26624,16384,torch.float8_e4m3fn,ck,33,0,1553.947,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2299.57,464.25,0.0 +256,8192,26624,16384,torch.float8_e4m3fn,ck,33,0,2904.3718,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2460.71,346.59,0.0 +256,16384,26624,16384,torch.float8_e4m3fn,ck,33,0,5797.7818,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2465.37,272.01,0.0 +256,32768,26624,16384,torch.float8_e4m3fn,ck,33,0,11524.899,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2480.48,235.83,0.0 +256,1,51200,5120,torch.float8_e4m3fn,ck,16,0,45.1392,a8w8_bpreshuffle_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,11.61,5809.84,0.0 +256,16,51200,5120,torch.float8_e4m3fn,ck,17,0,46.1585,a8w8_bpreshuffle_256x16x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,181.73,5716.48,0.0 +256,32,51200,5120,torch.float8_e4m3fn,ck,133,0,47.8343,a8w8_bpreshuffle_256x32x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v1,350.74,5552.18,0.0 +256,64,51200,5120,torch.float8_e4m3fn,ck,135,0,54.3316,a8w8_bpreshuffle_256x64x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,617.59,4951.54,0.0 +256,128,51200,5120,torch.float8_e4m3fn,ck,154,0,65.6842,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1021.69,4200.5,0.0 
+256,256,51200,5120,torch.float8_e4m3fn,ck,154,0,97.921,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1370.67,2958.19,0.0 +256,512,51200,5120,torch.float8_e4m3fn,ck,158,0,174.8476,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1535.25,1814.12,0.0 +256,1024,51200,5120,torch.float8_e4m3fn,ck,40,0,288.5662,a8w8_bpreshuffle_256x224x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1860.48,1289.98,0.0 +256,2048,51200,5120,torch.float8_e4m3fn,ck,33,0,524.9075,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2045.58,918.91,0.0 +256,4096,51200,5120,torch.float8_e4m3fn,ck,33,0,979.1344,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2193.25,717.52,0.0 +256,8192,51200,5120,torch.float8_e4m3fn,ck,33,0,1857.9691,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2311.65,615.16,0.0 +256,16384,51200,5120,torch.float8_e4m3fn,ck,33,0,3711.513,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2314.4,545.26,0.0 +256,32768,51200,5120,torch.float8_e4m3fn,ck,33,0,7575.0119,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2267.97,499.72,0.0 +256,1,53248,16384,torch.float8_e4m3fn,ck,23,0,150.2505,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,11.61,5807.22,0.0 +256,16,53248,16384,torch.float8_e4m3fn,ck,31,0,151.3677,a8w8_bpreshuffle_256x16x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v2,184.43,5776.54,0.0 +256,32,53248,16384,torch.float8_e4m3fn,ck,136,0,154.4861,a8w8_bpreshuffle_256x80x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,361.42,5672.66,0.0 
+256,64,53248,16384,torch.float8_e4m3fn,ck,135,0,156.5996,a8w8_bpreshuffle_256x64x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,713.09,5621.21,0.0 +256,128,53248,16384,torch.float8_e4m3fn,ck,154,0,184.7497,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1208.87,4807.28,0.0 +256,256,53248,16384,torch.float8_e4m3fn,ck,154,0,276.3785,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1616.18,3270.42,0.0 +256,512,53248,16384,torch.float8_e4m3fn,ck,154,0,492.8785,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,1812.52,1897.69,0.0 +256,1024,53248,16384,torch.float8_e4m3fn,ck,154,0,850.624,a8w8_bpreshuffle_256x128x256x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2100.47,1173.54,0.0 +256,2048,53248,16384,torch.float8_e4m3fn,ck,33,0,1541.2203,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2318.56,729.34,0.0 +256,4096,53248,16384,torch.float8_e4m3fn,ck,33,0,2910.6751,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2455.38,472.65,0.0 +256,8192,53248,16384,torch.float8_e4m3fn,ck,33,0,5722.4616,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2497.82,328.36,0.0 +256,16384,53248,16384,torch.float8_e4m3fn,ck,33,0,11284.2706,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2533.38,255.73,0.0 +256,32768,53248,16384,torch.float8_e4m3fn,ck,33,0,22600.3968,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2529.81,216.76,0.0 +256,1,57344,8192,torch.float8_e4m3fn,ck,9,0,77.922,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v1,12.06,6030.2,0.0 
+256,16,57344,8192,torch.float8_e4m3fn,ck,23,0,79.5497,a8w8_bpreshuffle_128x16x32x128_16x16_16x16_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_intrawave_v2,188.97,5929.98,0.0 +256,32,57344,8192,torch.float8_e4m3fn,ck,85,0,86.5188,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,347.49,5475.04,0.0 +256,64,57344,8192,torch.float8_e4m3fn,ck,85,0,87.6062,a8w8_bpreshuffle_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,686.36,5451.97,0.0 +256,128,57344,8192,torch.float8_e4m3fn,ck,54,0,95.8445,a8w8_bpreshuffle_256x160x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1254.73,5065.4,0.0 +256,256,57344,8192,torch.float8_e4m3fn,ck,33,0,129.4795,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,1857.58,3871.03,0.0 +256,512,57344,8192,torch.float8_e4m3fn,ck,33,0,227.0683,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2118.47,2345.89,0.0 +256,1024,57344,8192,torch.float8_e4m3fn,ck,33,0,409.7151,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2348.15,1453.67,0.0 +256,2048,57344,8192,torch.float8_e4m3fn,ck,33,0,762.0358,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2525.01,946.7,0.0 +256,4096,57344,8192,torch.float8_e4m3fn,ck,33,0,1497.7395,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2569.4,649.7,0.0 +256,8192,57344,8192,torch.float8_e4m3fn,ck,33,0,2991.7292,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2572.62,493.49,0.0 +256,16384,57344,8192,torch.float8_e4m3fn,ck,33,0,5984.2692,a8w8_bpreshuffle_256x256x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8x8x1_2x1_intrawave_v3,2572.27,414.93,0.0 
+256,32768,57344,8192,torch.float8_e4m3fn,ck,158,0,11963.3054,a8w8_bpreshuffle_256x192x256x128_16x16_16x16_8x32x1_8x32x1_1x16x1x16_8x8x1_1x2_intrawave_v3,2573.4,375.84,0.0 +304,64,1536,5120,torch.int8,asm,0,1,16.7759,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,60.0,500.04,0.0 +304,128,1536,5120,torch.int8,asm,0,1,17.9602,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,112.1,496.26,0.0 +304,256,1536,5120,torch.int8,asm,0,1,17.6791,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,227.76,563.46,0.0 +304,512,1536,5120,torch.int8,asm,1,1,18.7067,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,430.49,644.62,0.0 +304,1024,1536,5120,torch.int8,asm,2,1,21.0125,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,766.5,773.49,0.0 +304,1664,1536,5120,torch.int8,asm,4,1,27.6261,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,947.38,778.1,0.0 +304,4096,1536,5120,torch.int8,asm,6,1,63.8361,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1009.22,648.83,0.0 +304,8192,1536,5120,torch.int8,asm,6,1,128.8567,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,999.94,581.83,0.0 +304,10240,1536,5120,torch.int8,asm,6,1,142.1441,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1133.08,645.47,0.0 +304,12288,1536,5120,torch.int8,asm,7,1,185.0056,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_128x128E,1044.69,586.62,0.0 +304,16384,1536,5120,torch.int8,asm,6,1,211.7054,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1217.25,671.13,0.0 +304,20480,1536,5120,torch.int8,asm,6,1,271.7016,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1185.57,646.43,0.0 +304,24576,1536,5120,torch.int8,asm,6,1,328.3654,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1177.19,637.07,0.0 +304,30720,1536,5120,torch.int8,asm,6,1,402.8387,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1199.45,644.23,0.0 +304,32768,1536,5120,torch.int8,asm,6,1,406.391,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1268.23,679.89,0.0 
+304,40960,1536,5120,torch.int8,asm,6,1,525.9013,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1225.03,652.99,0.0 +304,64,5120,1280,torch.int8,asm,0,1,7.7424,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_16x128E,108.35,941.68,0.0 +304,128,5120,1280,torch.int8,asm,1,1,8.5797,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_32x128E,195.55,935.72,0.0 +304,256,5120,1280,torch.int8,asm,2,1,9.2104,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_48x128E,364.31,1031.74,0.0 +304,512,5120,1280,torch.int8,asm,4,1,11.2508,_ZN5aiter41I8gemm_bf16_perTokenI8_BpreShuffle_80x128E,596.48,1106.75,0.0 +304,1024,5120,1280,torch.int8,asm,6,1,18.2637,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,734.89,1004.73,0.0 +304,1664,5120,1280,torch.int8,asm,6,1,21.5241,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1013.3,1195.07,0.0 +304,4096,5120,1280,torch.int8,asm,6,1,58.5804,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,916.47,917.36,0.0 +304,8192,5120,1280,torch.int8,asm,6,1,105.657,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1016.25,955.22,0.0 +304,10240,5120,1280,torch.int8,asm,6,1,133.9571,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1001.95,929.54,0.0 +304,12288,5120,1280,torch.int8,asm,6,1,153.8919,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1046.59,962.44,0.0 +304,16384,5120,1280,torch.int8,asm,6,1,195.4489,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1098.74,999.22,0.0 +304,20480,5120,1280,torch.int8,asm,6,1,245.7639,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1092.25,986.65,0.0 +304,24576,5120,1280,torch.int8,asm,6,1,290.5831,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1108.54,996.85,0.0 +304,30720,5120,1280,torch.int8,asm,6,1,361.4235,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1114.08,997.3,0.0 +304,32768,5120,1280,torch.int8,asm,6,1,379.1231,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1132.87,1012.97,0.0 
+304,40960,5120,1280,torch.int8,asm,6,1,475.7676,_ZN5aiter42I8gemm_bf16_perTokenI8_BpreShuffle_112x256E,1128.43,1005.56,0.0 From 60f06c32aad91183c31d93071984d0be424fbbfa Mon Sep 17 00:00:00 2001 From: "Ying.Zhou2" Date: Mon, 23 Mar 2026 18:05:01 +0800 Subject: [PATCH 3/3] recover enum --- aiter/jit/optCompilerConfig.json | 12 ++++++++++++ aiter/ops/enum.py | 29 ++++++++++------------------- csrc/pybind/aiter_enum_pybind.cu | 26 ++++++++++++++++++++++++++ setup.py | 19 ++++++------------- 4 files changed, 54 insertions(+), 32 deletions(-) create mode 100644 csrc/pybind/aiter_enum_pybind.cu diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index 749b64309f..099d78e08b 100644 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -1,4 +1,16 @@ { + "module_aiter_enum": { + "srcs": [ + "f'{AITER_CSRC_DIR}/pybind/aiter_enum_pybind.cu'" + ], + "flags_extra_cc": [], + "flags_extra_hip": [], + "extra_ldflags": "None", + "extra_include": [], + "verbose": "False", + "torch_exclude": "True", + "blob_gen_cmd": "''" + }, "module_activation": { "srcs": [ "f'{AITER_CSRC_DIR}/pybind/activation_pybind.cu'", diff --git a/aiter/ops/enum.py b/aiter/ops/enum.py index 2fdcfc6f47..edc1ddd671 100644 --- a/aiter/ops/enum.py +++ b/aiter/ops/enum.py @@ -1,25 +1,16 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. - -# Mirror of csrc/include/aiter_enum.h -- update both when changing enum values -from enum import IntEnum +from ..jit.core import compile_ops +# from enum import Enum as Enum Enum = int -class ActivationType(IntEnum): - No = -1 - Silu = 0 - Gelu = 1 - Swiglu = 2 +@compile_ops("module_aiter_enum", "ActivationType") +def _ActivationType(dummy): ... + + +@compile_ops("module_aiter_enum", "QuantType") +def _QuantType(dummy): ... 
-class QuantType(IntEnum): - No = 0 - per_Tensor = 1 - per_Token = 2 - per_1x32 = 3 - per_1x128 = 4 - per_128x128 = 5 - per_256x128 = 6 - per_1024x128 = 7 +ActivationType = type(_ActivationType(0)) +QuantType = type(_QuantType(0)) diff --git a/csrc/pybind/aiter_enum_pybind.cu b/csrc/pybind/aiter_enum_pybind.cu new file mode 100644 index 0000000000..ee775e2067 --- /dev/null +++ b/csrc/pybind/aiter_enum_pybind.cu @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. +#include <pybind11/pybind11.h> +#include "aiter_enum.h" + +PYBIND11_MODULE(module_aiter_enum, m) +{ + pybind11::enum_<QuantType>(m, "QuantType") + .value("No", QuantType::No) + .value("per_Tensor", QuantType::per_Tensor) + .value("per_Token", QuantType::per_Token) + .value("per_1x32", QuantType::per_1x32) + .value("per_1x128", QuantType::per_1x128) + .value("per_128x128", QuantType::per_128x128) + .value("per_256x128", QuantType::per_256x128) + .value("per_1024x128", QuantType::per_1024x128) + .export_values(); + pybind11::enum_<ActivationType>(m, "ActivationType") + .value("No", ActivationType::No) + .value("Silu", ActivationType::Silu) + .value("Gelu", ActivationType::Gelu) + .value("Swiglu", ActivationType::Swiglu) + .export_values(); + pybind11::implicitly_convertible<int, QuantType>(); + pybind11::implicitly_convertible<int, ActivationType>(); +} diff --git a/setup.py b/setup.py index f7c1c1607f..30047fa5a5 100644 --- a/setup.py +++ b/setup.py @@ -149,7 +149,7 @@ def get_exclude_ops(): for module in all_modules: if PREBUILD_KERNELS == 1: - if "_tune" in module or module == "module_gemm_mi350_a8w8_blockscale_asm": + if "_tune" in module: exclude_ops.append(module) if "mha" in module and module not in [ "module_fmha_v3_fwd", @@ -157,23 +157,16 @@ def get_exclude_ops(): ]: exclude_ops.append(module) elif PREBUILD_KERNELS == 2: - # Exclude _bwd, _tune, and specific module - if ( - "_bwd" in module - or "_tune" in module - or module == "module_gemm_mi350_a8w8_blockscale_asm" - ): + # Exclude _bwd and _tune + if
"_bwd" in module or "_tune" in module: exclude_ops.append(module) elif PREBUILD_KERNELS == 3: # Keep only module_fmha_v3* - if not ( - module.startswith("module_fmha_v3") - or module == "module_gemm_mi350_a8w8_blockscale_asm" - ): + if not module.startswith("module_fmha_v3"): exclude_ops.append(module) else: - # Default behavior: exclude tunes and specific mi350 module - if "_tune" in module or module == "module_gemm_mi350_a8w8_blockscale_asm": + # Default behavior: exclude tunes + if "_tune" in module: exclude_ops.append(module) return exclude_ops