diff --git a/aiter/ops/triton/_triton_kernels/batched_gemm_afp4wfp4_pre_quant.py b/aiter/ops/triton/_triton_kernels/batched_gemm_a16wfp4.py similarity index 92% rename from aiter/ops/triton/_triton_kernels/batched_gemm_afp4wfp4_pre_quant.py rename to aiter/ops/triton/_triton_kernels/batched_gemm_a16wfp4.py index 86f7748acf..1ffabd7fcc 100755 --- a/aiter/ops/triton/_triton_kernels/batched_gemm_afp4wfp4_pre_quant.py +++ b/aiter/ops/triton/_triton_kernels/batched_gemm_a16wfp4.py @@ -14,8 +14,8 @@ from ..utils.core import AITER_TRITON_CONFIGS_PATH from .quant import _mxfp4_quant_op -_batched_gemm_afp4_wfp4_pre_quant_repr = make_kernel_repr( - "_batched_gemm_afp4_wfp4_pre_quant_kernel", +_batched_gemm_a16wfp4_repr = make_kernel_repr( + "_batched_gemm_a16wfp4_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", @@ -25,13 +25,15 @@ "SPLITK_BLOCK_SIZE", "EVEN_K", "GRID_MN", + "PRE_QUANT", + "HAVE_Y_SCALE", "cache_modifier", ], ) -_batched_gemm_afp4_wfp4_pre_quant_reduce_repr = make_kernel_repr( - "_batched_gemm_afp4_wfp4_pre_quant_reduce_kernel", +_batched_gemm_a16wfp4_reduce_repr = make_kernel_repr( + "_batched_gemm_a16wfp4_reduce_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", @@ -50,12 +52,13 @@ * triton.cdiv(args["N"], args["BLOCK_SIZE_N"]), } ) -@triton.jit(repr=_batched_gemm_afp4_wfp4_pre_quant_repr) -def _batched_gemm_afp4_wfp4_pre_quant_kernel( +@triton.jit(repr=_batched_gemm_a16wfp4_repr) +def _batched_gemm_a16wfp4_kernel( a_ptr, b_ptr, c_ptr, b_scales_ptr, + c_scale_ptr, M, N, K, @@ -81,6 +84,8 @@ def _batched_gemm_afp4_wfp4_pre_quant_kernel( SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, GRID_MN: tl.constexpr, + PRE_QUANT: tl.constexpr, + HAVE_Y_SCALE: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -121,6 +126,12 @@ def _batched_gemm_afp4_wfp4_pre_quant_kernel( stride_cb = tl.cast(stride_cb, tl.int64) pid_batch = tl.cast(pid_batch, tl.int64) + if HAVE_Y_SCALE: + c_scale = tl.load(c_scale_ptr) + else: + c_scale = 1 + c_scale_rcprl = (1 / c_scale).to(tl.float32) + 
if NUM_KSPLIT == 1: remap_xcd(pid, GRID_MN) @@ -189,7 +200,8 @@ def _batched_gemm_afp4_wfp4_pre_quant_kernel( b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0 ) - a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32) + if PRE_QUANT: # TODO add PRE_QUANT = False + a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32) accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1") @@ -198,6 +210,9 @@ def _batched_gemm_afp4_wfp4_pre_quant_kernel( b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk + if HAVE_Y_SCALE: + accumulator = accumulator * c_scale_rcprl + c = accumulator.to(c_ptr.type.element_ty) # Write back the block of the output matrix C with masks. @@ -214,8 +229,8 @@ def _batched_gemm_afp4_wfp4_pre_quant_kernel( tl.store(c_ptrs, c, mask=c_mask) -@triton.jit(repr=_batched_gemm_afp4_wfp4_pre_quant_reduce_repr) -def _batched_gemm_afp4_wfp4_pre_quant_reduce_kernel( +@triton.jit(repr=_batched_gemm_a16wfp4_reduce_repr) +def _batched_gemm_a16wfp4_reduce_kernel( c_in_ptr, c_out_ptr, M, diff --git a/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_a16w16.py b/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_a16w16.py new file mode 100644 index 0000000000..f1a49d150f --- /dev/null +++ b/aiter/ops/triton/_triton_kernels/fused_gemm_afp4wfp4_a16w16.py @@ -0,0 +1,867 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
@triton.heuristics(
    {
        "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0)
        and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0)
        and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0),
        "GRID_MN_FP4": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"])
        * triton.cdiv(args["N_fp4"], args["BLOCK_SIZE_N"]),
        "GRID_MN_BF16": lambda args: triton.cdiv(args["M"], args["BLOCK_SIZE_M"])
        * triton.cdiv(args["N_bf16"], args["BLOCK_SIZE_N"]),
    }
)
@triton.jit(repr=_fused_gemm_afp4wfp4_a16w16_repr)
def _fused_gemm_afp4wfp4_a16w16_kernel(
    # Pointers to matrices
    a_fp4_ptr,
    b_fp4_ptr,
    bias_fp4_ptr,
    a_fp4_scale_ptr,
    b_fp4_scale_ptr,
    c_fp4_ptr,
    a_bf16_ptr,
    b_bf16_ptr,
    bias_bf16_ptr,
    c_bf16_ptr,
    # Matrix dimensions
    M,
    N_fp4,
    N_bf16,
    K,
    stride_a_fp4_m,
    stride_a_fp4_k,
    stride_b_fp4_k,
    stride_b_fp4_n,
    stride_a_fp4_scale_m,
    stride_a_fp4_scale_k,
    stride_b_fp4_scale_n,
    stride_b_fp4_scale_k,
    stride_c_fp4_k,
    stride_c_fp4_m,
    stride_c_fp4_n,
    stride_a_bf16_m,
    stride_a_bf16_k,
    stride_b_bf16_k,
    stride_b_bf16_n,
    stride_c_bf16_k,
    stride_c_bf16_m,
    stride_c_bf16_n,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    NUM_KSPLIT: tl.constexpr,
    SPLITK_BLOCK_SIZE: tl.constexpr,
    ADD_BIAS_FP4: tl.constexpr,
    ADD_BIAS_BF16: tl.constexpr,
    EVEN_K: tl.constexpr,
    num_warps: tl.constexpr,
    num_stages: tl.constexpr,
    waves_per_eu: tl.constexpr,
    matrix_instr_nonkdim: tl.constexpr,
    GRID_MN_FP4: tl.constexpr,
    GRID_MN_BF16: tl.constexpr,
    SKIP_REDUCE: tl.constexpr,
    cache_modifier: tl.constexpr,
):
    """
    Compute two GEMMs that share the M dimension in a single launch.

    The 1-D grid is split along N. Programs whose column-tile index is below
    cdiv(N_fp4, BLOCK_SIZE_N) run the scaled path (tl.dot_scaled with "e2m1"
    operands and per-group scales) and write to c_fp4_ptr. The remaining
    programs run a plain tl.dot on a_bf16_ptr / b_bf16_ptr and write to
    c_bf16_ptr.

    K is compared against packed offsets of width BLOCK_SIZE_K // 2 on the
    scaled path and is doubled (K = 2 * K) on the tl.dot path.
    NOTE(review): this suggests K is the packed (two values per byte) extent
    of the fp4 operands - confirm against the launcher.

    With NUM_KSPLIT > 1 every program handles one K slice of SPLITK_BLOCK_SIZE
    elements and stores its partial tile at pid_k * stride_c_*_k. The bias is
    folded into the accumulator only when NUM_KSPLIT == 1, or when SKIP_REDUCE
    is set and pid_k == 0; otherwise the accumulator starts at zero.
    """

    tl.assume(stride_a_fp4_m > 0)
    tl.assume(stride_a_fp4_k > 0)
    tl.assume(stride_b_fp4_k > 0)
    tl.assume(stride_b_fp4_n > 0)
    tl.assume(stride_c_fp4_k > 0)
    tl.assume(stride_c_fp4_m > 0)
    tl.assume(stride_c_fp4_n > 0)
    tl.assume(stride_a_fp4_scale_m > 0)
    tl.assume(stride_a_fp4_scale_k > 0)
    tl.assume(stride_b_fp4_scale_k > 0)
    tl.assume(stride_b_fp4_scale_n > 0)

    tl.assume(stride_a_bf16_m > 0)
    tl.assume(stride_a_bf16_k > 0)
    tl.assume(stride_b_bf16_k > 0)
    tl.assume(stride_b_bf16_n > 0)
    tl.assume(stride_c_bf16_m > 0)
    tl.assume(stride_c_bf16_n > 0)

    # One scale value covers SCALE_GROUP_SIZE consecutive elements along K.
    SCALE_GROUP_SIZE: tl.constexpr = 32
    # Total number of (M, N) tiles across both GEMMs.
    GRID_MN: tl.constexpr = GRID_MN_FP4 + GRID_MN_BF16

    pid_unified = tl.program_id(axis=0)
    # remap_xcd is defined elsewhere; presumably it redistributes program ids
    # across the 8 XCDs - TODO confirm.
    pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8)

    # The K-split index varies fastest; the rest identifies the (M, N) tile.
    pid_k = pid_unified % NUM_KSPLIT
    pid = pid_unified // NUM_KSPLIT
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n_fp4 = tl.cdiv(N_fp4, BLOCK_SIZE_N)
    num_pid_n_bf16 = tl.cdiv(N_bf16, BLOCK_SIZE_N)
    # Column tiles of both outputs are laid out back to back: fp4 first.
    num_pid_n = num_pid_n_fp4 + num_pid_n_bf16

    if NUM_KSPLIT == 1:
        pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M)
    else:
        pid_m = pid // num_pid_n
        pid_n = pid % num_pid_n

    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    tl.assume(pid_k >= 0)

    # Programs whose K slice starts at or beyond K do no work and store nothing.
    if (pid_k * SPLITK_BLOCK_SIZE // 2) < K:

        num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2)

        # Row indices wrap modulo M so loads stay in range; the stores below
        # use un-wrapped offsets with an explicit `< M` mask.
        offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M

        if pid_n < num_pid_n_fp4:
            # ---- scaled (tl.dot_scaled) path -> c_fp4_ptr ----
            offs_k_fp4 = tl.arange(0, BLOCK_SIZE_K // 2)
            offs_k_fp4_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k_fp4
            offs_b_fp4_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_fp4
            a_fp4_ptrs = a_fp4_ptr + (
                offs_am[:, None] * stride_a_fp4_m
                + offs_k_fp4_split[None, :] * stride_a_fp4_k
            )
            b_fp4_ptrs = b_fp4_ptr + (
                offs_k_fp4_split[:, None] * stride_b_fp4_k
                + offs_b_fp4_n[None, :] * stride_b_fp4_n
            )

            offs_k_fp4_scale = (
                pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)
            ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE)
            a_fp4_scale_ptrs = (
                a_fp4_scale_ptr
                + offs_am[:, None] * stride_a_fp4_scale_m
                + offs_k_fp4_scale[None, :] * stride_a_fp4_scale_k
            )
            # B scales are N x K even though B operand is K x N.
            b_fp4_scale_ptrs = (
                b_fp4_scale_ptr
                + offs_b_fp4_n[:, None] * stride_b_fp4_scale_n
                + offs_k_fp4_scale[None, :] * stride_b_fp4_scale_k
            )

            # Seed the accumulator with the bias only where this program's
            # output is final (no split-K), or for the first K slice when
            # SKIP_REDUCE is set; every other case starts from zero.
            if ADD_BIAS_FP4:
                if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0):
                    accumulator_fp4 = tl.load(bias_fp4_ptr + offs_b_fp4_n).to(
                        dtype=tl.float32
                    )
                    accumulator_fp4 = tl.broadcast_to(
                        accumulator_fp4[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N)
                    )
                else:
                    accumulator_fp4 = tl.zeros(
                        (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
                    )
            else:
                accumulator_fp4 = tl.zeros(
                    (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
                )

            # k is the global K-block index, used only to build the tail mask.
            for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter):
                a_scale = tl.load(a_fp4_scale_ptrs)
                b_scale = tl.load(b_fp4_scale_ptrs, cache_modifier=cache_modifier)

                if EVEN_K:
                    a = tl.load(a_fp4_ptrs)
                    b = tl.load(b_fp4_ptrs, cache_modifier=cache_modifier)
                else:
                    # Zero-fill the part of the block that lies past K.
                    a = tl.load(
                        a_fp4_ptrs,
                        mask=offs_k_fp4[None, :] < K - k * (BLOCK_SIZE_K // 2),
                        other=0.0,
                    )
                    b = tl.load(
                        b_fp4_ptrs,
                        mask=offs_k_fp4[:, None] < K - k * (BLOCK_SIZE_K // 2),
                        other=0.0,
                        cache_modifier=cache_modifier,
                    )

                accumulator_fp4 += tl.dot_scaled(a, a_scale, "e2m1", b, b_scale, "e2m1")

                # Advance operand and scale pointers to the next K block.
                a_fp4_ptrs += (BLOCK_SIZE_K // 2) * stride_a_fp4_k
                b_fp4_ptrs += (BLOCK_SIZE_K // 2) * stride_b_fp4_k
                a_fp4_scale_ptrs += (
                    BLOCK_SIZE_K // SCALE_GROUP_SIZE
                ) * stride_a_fp4_scale_k
                b_fp4_scale_ptrs += (
                    BLOCK_SIZE_K // SCALE_GROUP_SIZE
                ) * stride_b_fp4_scale_k

            c_fp4 = accumulator_fp4.to(c_fp4_ptr.type.element_ty)

            # Output offsets are computed in int64 and are not wrapped; the
            # mask below drops rows/columns outside (M, N_fp4).
            offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(
                tl.int64
            )
            offs_c_fp4_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(
                0, BLOCK_SIZE_N
            ).to(tl.int64)
            c_fp4_ptrs = (
                c_fp4_ptr
                + stride_c_fp4_m * offs_cm[:, None]
                + stride_c_fp4_n * offs_c_fp4_n[None, :]
                + pid_k * stride_c_fp4_k
            )
            c_fp4_mask = (offs_cm[:, None] < M) & (offs_c_fp4_n[None, :] < N_fp4)
            tl.store(c_fp4_ptrs, c_fp4, mask=c_fp4_mask)
        else:
            # ---- plain tl.dot path -> c_bf16_ptr ----
            # Re-base the column-tile index so it starts at 0 for this output.
            pid_n -= num_pid_n_fp4
            offs_k_bf16 = tl.arange(0, BLOCK_SIZE_K)
            offs_k_bf16_split = pid_k * SPLITK_BLOCK_SIZE + offs_k_bf16
            # This path indexes K in units twice as fine as the scaled path.
            K = 2 * K

            offs_b_bf16_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_bf16
            a_ptrs = a_bf16_ptr + (
                offs_am[:, None] * stride_a_bf16_m
                + offs_k_bf16_split[None, :] * stride_a_bf16_k
            )
            b_ptrs = b_bf16_ptr + (
                offs_k_bf16_split[:, None] * stride_b_bf16_k
                + offs_b_bf16_n[None, :] * stride_b_bf16_n
            )

            # Same bias-seeding rule as the scaled path.
            if ADD_BIAS_BF16:
                if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0):
                    accumulator_bf16 = tl.load(bias_bf16_ptr + offs_b_bf16_n).to(
                        dtype=tl.float32
                    )
                    accumulator_bf16 = tl.broadcast_to(
                        accumulator_bf16[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N)
                    )
                else:
                    accumulator_bf16 = tl.zeros(
                        (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
                    )
            else:
                accumulator_bf16 = tl.zeros(
                    (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32
                )

            for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter):
                if EVEN_K:
                    a = tl.load(a_ptrs)
                    b = tl.load(b_ptrs, cache_modifier=cache_modifier)
                else:
                    # Zero-fill the part of the block that lies past (doubled) K.
                    a = tl.load(
                        a_ptrs,
                        mask=offs_k_bf16[None, :] < K - k * BLOCK_SIZE_K,
                        other=0.0,
                    )
                    b = tl.load(
                        b_ptrs,
                        mask=offs_k_bf16[:, None] < K - k * BLOCK_SIZE_K,
                        other=0.0,
                        cache_modifier=cache_modifier,
                    )

                accumulator_bf16 += tl.dot(a, b, input_precision="ieee")

                a_ptrs += BLOCK_SIZE_K * stride_a_bf16_k
                b_ptrs += BLOCK_SIZE_K * stride_b_bf16_k

            c_bf16 = accumulator_bf16.to(c_bf16_ptr.type.element_ty)

            # Write back the block of the output matrix C with masks.
            offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(
                tl.int64
            )
            offs_c_bf16_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(
                0, BLOCK_SIZE_N
            ).to(tl.int64)
            c_bf16_ptrs = (
                c_bf16_ptr
                + stride_c_bf16_m * offs_cm[:, None]
                + stride_c_bf16_n * offs_c_bf16_n[None, :]
                + pid_k * stride_c_bf16_k
            )
            c_bf16_mask = (offs_cm[:, None] < M) & (offs_c_bf16_n[None, :] < N_bf16)
            tl.store(c_bf16_ptrs, c_bf16, mask=c_bf16_mask)
stride_a_bf16_m, + stride_a_bf16_k, + stride_b_bf16_k, + stride_b_bf16_n, + stride_c_bf16_k, + stride_c_bf16_m, + stride_c_bf16_n, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + NUM_KSPLIT: tl.constexpr, + SPLITK_BLOCK_SIZE: tl.constexpr, + ADD_BIAS_FP4: tl.constexpr, + ADD_BIAS_BF16: tl.constexpr, + EVEN_K: tl.constexpr, + num_warps: tl.constexpr, + num_stages: tl.constexpr, + waves_per_eu: tl.constexpr, + matrix_instr_nonkdim: tl.constexpr, + GRID_MN_FP4: tl.constexpr, + GRID_MN_BF16: tl.constexpr, + SKIP_REDUCE: tl.constexpr, + cache_modifier: tl.constexpr, +): + + tl.assume(stride_a_fp4_m > 0) + tl.assume(stride_a_fp4_k > 0) + tl.assume(stride_b_fp4_k > 0) + tl.assume(stride_b_fp4_n > 0) + tl.assume(stride_c_fp4_k > 0) + tl.assume(stride_c_fp4_m > 0) + tl.assume(stride_c_fp4_n > 0) + tl.assume(stride_a_fp4_scale_m > 0) + tl.assume(stride_a_fp4_scale_k > 0) + tl.assume(stride_b_fp4_scale_k > 0) + tl.assume(stride_b_fp4_scale_n > 0) + + tl.assume(stride_a_bf16_m > 0) + tl.assume(stride_a_bf16_k > 0) + tl.assume(stride_b_bf16_k > 0) + tl.assume(stride_b_bf16_n > 0) + tl.assume(stride_c_bf16_m > 0) + tl.assume(stride_c_bf16_n > 0) + + SCALE_GROUP_SIZE: tl.constexpr = 32 + GRID_MN: tl.constexpr = GRID_MN_FP4 + GRID_MN_BF16 + + pid_unified = tl.program_id(axis=0) + pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) + + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n_fp4 = tl.cdiv(N_fp4, BLOCK_SIZE_N) + num_pid_n_bf16 = tl.cdiv(N_bf16, BLOCK_SIZE_N) + num_pid_n = num_pid_n_fp4 + num_pid_n_bf16 + + if NUM_KSPLIT == 1: + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + tl.assume(pid_k >= 0) + + if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: + offs_am = 
(pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + + if pid_n < num_pid_n_fp4: + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) + + offs_k_fp4 = tl.arange(0, BLOCK_SIZE_K // 2) + offs_k_fp4_shuffle_arr = tl.arange(0, (BLOCK_SIZE_K // 2) * 16) + offs_k_fp4_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k_fp4 + offs_k_fp4_shuffle = ( + pid_k * (SPLITK_BLOCK_SIZE // 2) * 16 + offs_k_fp4_shuffle_arr + ) + + offs_b_fp4_n = ( + pid_n * (BLOCK_SIZE_N // 16) + tl.arange(0, BLOCK_SIZE_N // 16) + ) % N_fp4 + a_fp4_ptrs = a_fp4_ptr + ( + offs_am[:, None] * stride_a_fp4_m + + offs_k_fp4_split[None, :] * stride_a_fp4_k + ) + b_fp4_ptrs = b_fp4_ptr + ( + offs_b_fp4_n[:, None] * stride_b_fp4_n + + offs_k_fp4_shuffle[None, :] * stride_b_fp4_k + ) + + offs_b_fp4_scale_n = ( + pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32)) + ) % N_fp4 + offs_k_fp4_scale = ( + pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) * 32 + ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE * 32) + b_fp4_scale_ptrs = ( + b_fp4_scale_ptr + + offs_b_fp4_scale_n[:, None] * stride_b_fp4_scale_n + + offs_k_fp4_scale[None, :] * stride_b_fp4_scale_k + ) + + if BLOCK_SIZE_M < 32: + offs_ks_non_shufl = ( + pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) + ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + a_fp4_scale_ptrs = ( + a_fp4_scale_ptr + + offs_am[:, None] * stride_a_fp4_scale_m + + offs_ks_non_shufl[None, :] * stride_a_fp4_scale_k + ) + else: + offs_a_fp4_scale_m = ( + pid_m * (BLOCK_SIZE_M // 32) + tl.arange(0, (BLOCK_SIZE_M // 32)) + ) % M + a_fp4_scale_ptrs = ( + a_fp4_scale_ptr + + offs_a_fp4_scale_m[:, None] * stride_a_fp4_scale_m + + offs_k_fp4_scale[None, :] * stride_a_fp4_scale_k + ) + + if ADD_BIAS_FP4: + offs_b_fp4_n_bias = ( + pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + ) % N_fp4 + if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0): + accumulator_fp4 = tl.load(bias_fp4_ptr + offs_b_fp4_n_bias).to( + dtype=tl.float32 + ) + accumulator_fp4 = 
tl.broadcast_to( + accumulator_fp4[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) + ) + else: + accumulator_fp4 = tl.zeros( + (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 + ) + else: + accumulator_fp4 = tl.zeros( + (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 + ) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + if BLOCK_SIZE_M < 32: + a_scale = tl.load(a_fp4_scale_ptrs) + else: + a_scale = ( + tl.load(a_fp4_scale_ptrs) + .reshape( + BLOCK_SIZE_M // 32, + BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8, + 4, + 16, + 2, + 2, + 1, + ) + .permute(0, 5, 3, 1, 4, 2, 6) + .reshape(BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + ) + + b_scale = ( + tl.load(b_fp4_scale_ptrs, cache_modifier=cache_modifier) + .reshape( + BLOCK_SIZE_N // 32, + BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8, + 4, + 16, + 2, + 2, + 1, + ) + .permute(0, 5, 3, 1, 4, 2, 6) + .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // SCALE_GROUP_SIZE) + ) + + if EVEN_K: + a = tl.load(a_fp4_ptrs) + b = tl.load(b_fp4_ptrs, cache_modifier=cache_modifier) + # else: + # a = tl.load( + # a_fp4_ptrs, + # mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), + # other=0.0 + # ) + # b = tl.load( + # b_fp4_ptrs, + # mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), + # other=0.0, + # cache_modifier=cache_modifier, + # ) + + b = ( + b.reshape( + 1, + BLOCK_SIZE_N // 16, + BLOCK_SIZE_K // 64, + 2, + 16, + 16, + ) + .permute(0, 1, 4, 2, 3, 5) + .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // 2) + .trans(1, 0) + ) + + accumulator_fp4 += tl.dot_scaled(a, a_scale, "e2m1", b, b_scale, "e2m1") + + a_fp4_ptrs += (BLOCK_SIZE_K // 2) * stride_a_fp4_k + b_fp4_ptrs += (BLOCK_SIZE_K // 2) * 16 * stride_b_fp4_k + if BLOCK_SIZE_M < 32: + a_fp4_scale_ptrs += ( + BLOCK_SIZE_K // SCALE_GROUP_SIZE + ) * stride_a_fp4_scale_k + else: + a_fp4_scale_ptrs += BLOCK_SIZE_K * stride_a_fp4_scale_k + b_fp4_scale_ptrs += BLOCK_SIZE_K * stride_b_fp4_scale_k + + c_fp4 = accumulator_fp4.to(c_fp4_ptr.type.element_ty) + + offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + 
tl.arange(0, BLOCK_SIZE_M).to( + tl.int64 + ) + offs_c_fp4_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange( + 0, BLOCK_SIZE_N + ).to(tl.int64) + c_fp4_ptrs = ( + c_fp4_ptr + + stride_c_fp4_m * offs_cm[:, None] + + stride_c_fp4_n * offs_c_fp4_n[None, :] + + pid_k * stride_c_fp4_k + ) + c_fp4_mask = (offs_cm[:, None] < M) & (offs_c_fp4_n[None, :] < N_fp4) + tl.store(c_fp4_ptrs, c_fp4, mask=c_fp4_mask, cache_modifier=".wt") + else: + pid_n -= num_pid_n_fp4 + K = 2 * K + + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE, BLOCK_SIZE_K) + + offs_k_bf16 = tl.arange(0, BLOCK_SIZE_K) + offs_k_bf16_split = pid_k * (SPLITK_BLOCK_SIZE) + offs_k_bf16 + offs_b_bf16_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_bf16 + a_ptrs = a_bf16_ptr + ( + offs_am[:, None] * stride_a_bf16_m + + offs_k_bf16_split[None, :] * stride_a_bf16_k + ) + b_ptrs = b_bf16_ptr + ( + offs_k_bf16_split[:, None] * stride_b_bf16_k + + offs_b_bf16_n[None, :] * stride_b_bf16_n + ) + + if ADD_BIAS_BF16: + if NUM_KSPLIT == 1 or (SKIP_REDUCE and pid_k == 0): + accumulator_bf16 = tl.load(bias_bf16_ptr + offs_b_bf16_n).to( + dtype=tl.float32 + ) + accumulator_bf16 = tl.broadcast_to( + accumulator_bf16[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) + ) + else: + accumulator_bf16 = tl.zeros( + (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 + ) + else: + accumulator_bf16 = tl.zeros( + (BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32 + ) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + a = tl.load( + a_ptrs, + mask=offs_k_bf16[None, :] < K - k * BLOCK_SIZE_K, + other=0.0, + ) + b = tl.load( + b_ptrs, + mask=offs_k_bf16[:, None] < K - k * BLOCK_SIZE_K, + other=0.0, + cache_modifier=cache_modifier, + ) + + accumulator_bf16 += tl.dot(a, b, input_precision="ieee") + + a_ptrs += BLOCK_SIZE_K * stride_a_bf16_k + b_ptrs += BLOCK_SIZE_K * stride_b_bf16_k + + c_bf16 = accumulator_bf16.to(c_bf16_ptr.type.element_ty) + + 
# Write back the block of the output matrix C with masks. + offs_cm = pid_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64 + ) + offs_c_bf16_n = pid_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange( + 0, BLOCK_SIZE_N + ).to(tl.int64) + c_bf16_ptrs = ( + c_bf16_ptr + + stride_c_bf16_m * offs_cm[:, None] + + stride_c_bf16_n * offs_c_bf16_n[None, :] + + pid_k * stride_c_bf16_k + ) + c_bf16_mask = (offs_cm[:, None] < M) & (offs_c_bf16_n[None, :] < N_bf16) + tl.store(c_bf16_ptrs, c_bf16, mask=c_bf16_mask) + + +_gemm_afp4wfp4_a16w16_reduce_repr = make_kernel_repr( + "_fused_gemm_afp4wfp4_a16w16_reduce_kernel", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "ACTUAL_KSPLIT", + "MAX_KSPLIT", + "ADD_BIAS_FP4", + "ADD_BIAS_BF16", + ], +) + + +@triton.heuristics({}) # dummy heuristics to invoke kernel re-naming +@triton.jit(repr=_gemm_afp4wfp4_a16w16_reduce_repr) +def _fused_gemm_afp4wfp4_a16w16_reduce_kernel( + bias_fp4_ptr, + c_fp4_in_ptr, + c_fp4_out_ptr, + bias_bf16_ptr, + c_bf16_in_ptr, + c_bf16_out_ptr, + M, + N_fp4, + N_bf16, + stride_c_fp4_in_k, + stride_c_fp4_in_m, + stride_c_fp4_in_n, + stride_c_fp4_out_m, + stride_c_fp4_out_n, + stride_c_bf16_in_k, + stride_c_bf16_in_m, + stride_c_bf16_in_n, + stride_c_bf16_out_m, + stride_c_bf16_out_n, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + ACTUAL_KSPLIT: tl.constexpr, + MAX_KSPLIT: tl.constexpr, + ADD_BIAS_FP4: tl.constexpr, + ADD_BIAS_BF16: tl.constexpr, +): + + tl.assume(stride_c_fp4_in_k > 0) + tl.assume(stride_c_fp4_in_m > 0) + tl.assume(stride_c_fp4_in_n > 0) + tl.assume(stride_c_fp4_out_m > 0) + tl.assume(stride_c_fp4_out_n > 0) + + tl.assume(stride_c_bf16_in_k > 0) + tl.assume(stride_c_bf16_in_m > 0) + tl.assume(stride_c_bf16_in_n > 0) + tl.assume(stride_c_bf16_out_m > 0) + tl.assume(stride_c_bf16_out_n > 0) + + pid_m = tl.program_id(axis=0) + pid_n = tl.program_id(axis=1) + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + + offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % 
M + + num_pid_n_fp4 = tl.cdiv(N_fp4, BLOCK_SIZE_N) + offs_k = tl.arange(0, MAX_KSPLIT) + acc_dtype = tl.float32 if c_fp4_in_ptr.type.element_ty != tl.int8 else tl.int32 + + if pid_n < num_pid_n_fp4: + offs_fp4_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_fp4 + c_fp4_in_ptrs = ( + c_fp4_in_ptr + + (offs_k[:, None, None] * stride_c_fp4_in_k) + + (offs_m[None, :, None] * stride_c_fp4_in_m) + + (offs_fp4_n[None, None, :] * stride_c_fp4_in_n) + ) + + if ACTUAL_KSPLIT == MAX_KSPLIT: + c_fp4 = tl.load(c_fp4_in_ptrs) + else: + c_fp4 = tl.load( + c_fp4_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT, other=0.0 + ) + c_fp4 = tl.sum(c_fp4, axis=0) + if ADD_BIAS_FP4: + bias_fp4 = tl.load(bias_fp4_ptr + offs_fp4_n).to(dtype=acc_dtype) + bias_fp4 = tl.broadcast_to(bias_fp4[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N)) + c_fp4 += bias_fp4 + + c_fp4 = c_fp4.to(c_fp4_out_ptr.type.element_ty) + + c_fp4_out_ptrs = ( + c_fp4_out_ptr + + (offs_m[:, None] * stride_c_fp4_out_m) + + (offs_fp4_n[None, :] * stride_c_fp4_out_n) + ) + + tl.store(c_fp4_out_ptrs, c_fp4) + else: + pid_n -= num_pid_n_fp4 + + offs_bf16_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N_bf16 + c_bf16_in_ptrs = ( + c_bf16_in_ptr + + (offs_k[:, None, None] * stride_c_bf16_in_k) + + (offs_m[None, :, None] * stride_c_bf16_in_m) + + (offs_bf16_n[None, None, :] * stride_c_bf16_in_n) + ) + + if ACTUAL_KSPLIT == MAX_KSPLIT: + c_bf16 = tl.load(c_bf16_in_ptrs) + else: + c_bf16 = tl.load( + c_bf16_in_ptrs, mask=offs_k[:, None, None] < ACTUAL_KSPLIT, other=0.0 + ) + c_bf16 = tl.sum(c_bf16, axis=0) + if ADD_BIAS_BF16: + bias_bf16 = tl.load(bias_bf16_ptr + offs_bf16_n).to(dtype=acc_dtype) + bias_bf16 = tl.broadcast_to( + bias_bf16[None, :], (BLOCK_SIZE_M, BLOCK_SIZE_N) + ) + c_bf16 += bias_bf16 + + c_bf16 = c_bf16.to(c_bf16_out_ptr.type.element_ty) + + c_bf16_out_ptrs = ( + c_bf16_out_ptr + + (offs_m[:, None] * stride_c_bf16_out_m) + + (offs_bf16_n[None, :] * stride_c_bf16_out_n) + ) + c_bf16_mask = 
@functools.lru_cache(maxsize=1024)
def _get_config(
    M: int,
    N_fp4: int,
    N_bf16: int,
    K: int,
    shuffle: bool = False,
):
    """Return the launch configuration for the fused AFP4WFP4 + A16W16 GEMM.

    Configurations are read from JSON files under
    ``{AITER_TRITON_CONFIGS_PATH}/gemm`` and kept in a dict stored on this
    function (``_get_config._config_dict``) so each file is parsed once.

    Lookup order:
      1. A shape-specific file named with ``N4={N_fp4}``, ``N16={N_bf16}``
         and ``K={2*K}``, if it exists.
      2. Otherwise the device-wide default file.

    Both file names carry a ``_PRESHUFFLED`` infix when ``shuffle`` is true.
    The in-memory keys carry the same suffix, so plain and preshuffled
    configurations never overwrite or shadow each other. With
    ``shuffle=False`` the keys are unchanged (``"default"`` and
    ``"{N_fp4}_{N_bf16}_{K}"``).

    Args:
        M: Row count; selects the size bucket inside the chosen file.
        N_fp4: N extent of the FP4 GEMM (used for the shape-specific lookup).
        N_bf16: N extent of the BF16 GEMM (used for the shape-specific lookup).
        K: K extent as passed by the caller; the file name uses ``2 * K``.
        shuffle: Select the ``_PRESHUFFLED`` configuration files.

    Returns:
        The bucket dict from the loaded JSON. It is the cached object itself,
        not a copy.

    Raises:
        OSError: If the default configuration file cannot be opened.
        KeyError: If no size bucket matches and the file has no ``"any"`` entry.
    """
    suffix = "_PRESHUFFLED" if shuffle else ""

    if not hasattr(_get_config, "_config_dict"):
        _get_config._config_dict = {}
    loaded = _get_config._config_dict

    # The default file differs between plain and preshuffled layouts, so it
    # is stored under a suffix-qualified key rather than a single "default".
    default_key = f"default{suffix}"
    if default_key not in loaded:
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-FUSED-GEMM-AFP4WFP4{suffix}-A16W16.json"
        with open(fpath, "r") as file:
            loaded[default_key] = json.load(file)

    key = f"{N_fp4}_{N_bf16}_{K}{suffix}"
    if key not in loaded:
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-FUSED-GEMM-AFP4WFP4{suffix}-A16W16-N4={N_fp4}-N16={N_bf16}-K={2*K}.json"
        if os.path.exists(fpath):
            with open(fpath, "r") as file:
                loaded[key] = json.load(file)
        else:
            key = default_key  # fall back to default config

    config = loaded[key]

    # Pick the bucket for M. A bucket that is absent from the file falls
    # through to the next test and, ultimately, to "any".
    if M < 16 and "small" in config:
        return config["small"]
    elif M < 32 and "small_M16" in config:
        return config["small_M16"]
    elif M <= 128:
        BLK_M = triton.next_power_of_2(M)
        if BLK_M == 32 and "medium_M32" in config:
            return config["medium_M32"]
        elif BLK_M == 64 and "medium_M64" in config:
            return config["medium_M64"]
        elif BLK_M == 128 and "medium_M128" in config:
            return config["medium_M128"]
    elif M <= 256 and "large" in config:
        return config["large"]
    else:
        BLK_M = triton.next_power_of_2(M)
        if f"xlarge_M{BLK_M}" in config:
            return config[f"xlarge_M{BLK_M}"]
        elif "xlarge" in config:
            return config["xlarge"]

    return config["any"]
+ IS_A_SCALAR: tl.constexpr, + IS_B_SCALAR: tl.constexpr, + IS_A_TENSOR: tl.constexpr, + IS_B_TENSOR: tl.constexpr, + FUSE_TYPE: tl.constexpr, + num_warps: tl.constexpr, + num_stages: tl.constexpr, + waves_per_eu: tl.constexpr, + matrix_instr_nonkdim: tl.constexpr, + cache_modifier: tl.constexpr, +): + """ + Kernel for computing the matmul C = A x B. + A and B inputs are in the microscale fp4 (mxfp4) format. + A_scales and B_scales are in e8m0 format. + A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + + tl.assume(stride_am > 0) + tl.assume(stride_ak > 0) + tl.assume(stride_bk > 0) + tl.assume(stride_bn > 0) + tl.assume(stride_cm > 0) + tl.assume(stride_cn > 0) + tl.assume(stride_asm > 0) + tl.assume(stride_ask > 0) + tl.assume(stride_bsk > 0) + tl.assume(stride_bsn > 0) + + GRID_MN = tl.cdiv(M, BLOCK_SIZE_M) * tl.cdiv(N, BLOCK_SIZE_N) + + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid_unified = tl.program_id(axis=0) + # remap so that XCDs get continous chunks of pids (of CHUNK_SIZE). + pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8) + + pid_k = pid_unified % NUM_KSPLIT + pid = pid_unified // NUM_KSPLIT + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + if NUM_KSPLIT == 1: + pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) + else: + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + tl.assume(pid_m >= 0) + tl.assume(pid_n >= 0) + # We assume 32 elements along K share the same scale. + SCALE_GROUP_SIZE: tl.constexpr = 32 + + if (pid_k * SPLITK_BLOCK_SIZE // 2) < K: + + num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2) + + # Create pointers for first block of A and B input matrices + # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container. 
+ offs_k = tl.arange(0, BLOCK_SIZE_K // 2) + offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak + ) + b_ptrs = b_ptr + ( + offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn + ) + # Create pointers for the first block of A and B scales + offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)) + tl.arange( + 0, BLOCK_SIZE_K // SCALE_GROUP_SIZE + ) + a_scale_ptrs = ( + a_scales_ptr + offs_am[:, None] * stride_asm + offs_ks[None, :] * stride_ask + ) + # B scales are N x K even though B operand is K x N. + b_scale_ptrs = ( + b_scales_ptr + offs_bn[:, None] * stride_bsn + offs_ks[None, :] * stride_bsk + ) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter): + a_scales = tl.load(a_scale_ptrs) + b_scales = tl.load(b_scale_ptrs, cache_modifier=cache_modifier) + + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs, cache_modifier=cache_modifier) + else: + a = tl.load( + a_ptrs, mask=offs_k[None, :] < K - k * (BLOCK_SIZE_K // 2), other=0 + ) + b = tl.load( + b_ptrs, + mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), + other=0, + cache_modifier=cache_modifier, + ) + + accumulator = tl.dot_scaled( + a, a_scales, "e2m1", b, b_scales, "e2m1", accumulator + ) + + # Advance the ptrs to the next K block. + a_ptrs += (BLOCK_SIZE_K // 2) * stride_ak + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + a_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_ask + b_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_bsk + + # Write back the block of the output matrix C with masks. 
# Fields that make_kernel_repr is given for naming the compiled kernel.
_fused_gemm_afp4wfp4_preshuffle_mul_add_repr = make_kernel_repr(
    "_fused_gemm_afp4wfp4_preshuffle_mul_add_kernel",
    [
        "BLOCK_SIZE_M",
        "BLOCK_SIZE_N",
        "BLOCK_SIZE_K",
        "GROUP_SIZE_M",
        "num_warps",
        "num_stages",
        "waves_per_eu",
        "matrix_instr_nonkdim",
        "cache_modifier",
        "NUM_KSPLIT",
    ],
)


# EVEN_K is derived per launch: it is true only when K (counted in packed
# bytes) divides evenly into both the K block and the split-K block.
@triton.heuristics(
    {
        "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0)
        and (args["SPLITK_BLOCK_SIZE"] % args["BLOCK_SIZE_K"] == 0)
        and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0),
    }
)
@triton.jit(repr=_fused_gemm_afp4wfp4_preshuffle_mul_add_repr)
def _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    a_scales_ptr,
    b_scales_ptr,
    c_a_ptr,
    c_b_ptr,
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bn,
    stride_bk,
    stride_ck,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bsn,
    stride_bsk,
    stride_cam,
    stride_can,
    stride_cbm,
    stride_cbn,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    NUM_KSPLIT: tl.constexpr,
    SPLITK_BLOCK_SIZE: tl.constexpr,
    EVEN_K: tl.constexpr,
    IS_A_SCALAR: tl.constexpr,
    IS_B_SCALAR: tl.constexpr,
    IS_A_TENSOR: tl.constexpr,
    IS_B_TENSOR: tl.constexpr,
    FUSE_TYPE: tl.constexpr,
    num_warps: tl.constexpr,
    num_stages: tl.constexpr,
    waves_per_eu: tl.constexpr,
    matrix_instr_nonkdim: tl.constexpr,
    cache_modifier: tl.constexpr,
):
    """
    Scaled-dot GEMM on e2m1 (fp4) operands with a fused multiply-add epilogue.

    Each program accumulates one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of A x B in
    fp32 for one K split (pid_k) using tl.dot_scaled with "e2m1" operands.
    B is addressed as (BLOCK_SIZE_N // 16) rows by (BLOCK_SIZE_K // 2) * 16
    columns and is reshaped/permuted inside the K loop before the dot; the
    scale tiles are likewise reshaped/permuted (A scales only when
    BLOCK_SIZE_M >= 32).
    NOTE(review): these layouts presumably mirror a host-side pre-shuffle of
    the weights and scales - confirm against the caller.

    Epilogue (only when NUM_KSPLIT == 1):
        FUSE_TYPE == 0: C = c_a * acc + c_b
        otherwise:      C = c_b * c_a + acc
    c_a / c_b are read as a single element through the pointer when
    IS_*_SCALAR and IS_*_TENSOR are both set, used directly as the passed
    value when only IS_*_SCALAR is set, and otherwise loaded per output
    element using stride_ca* / stride_cb*.

    When NUM_KSPLIT > 1 no epilogue is applied here; the raw partial tile is
    stored at c_ptr + pid_k * stride_ck.

    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
    """

    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)
    tl.assume(stride_asm > 0)
    tl.assume(stride_ask > 0)
    tl.assume(stride_bsk > 0)
    tl.assume(stride_bsn > 0)

    GRID_MN = tl.cdiv(M, BLOCK_SIZE_M) * tl.cdiv(N, BLOCK_SIZE_N)

    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid_unified = tl.program_id(axis=0)
    pid_unified = remap_xcd(pid_unified, GRID_MN * NUM_KSPLIT, NUM_XCDS=8)
    # The K-split index is the fastest-varying component of the remapped id.
    pid_k = pid_unified % NUM_KSPLIT
    pid = pid_unified // NUM_KSPLIT
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)

    if NUM_KSPLIT == 1:
        pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M)
    else:
        # With split-K the tiles are laid out in plain row-major order.
        pid_m = pid // num_pid_n
        pid_n = pid % num_pid_n

    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    # We assume 32 elements along K share the same scale.
    SCALE_GROUP_SIZE: tl.constexpr = 32

    # Programs whose K split starts at or beyond K do nothing at all
    # (no load, no store).
    if (pid_k * SPLITK_BLOCK_SIZE // 2) < K:

        num_k_iter = tl.cdiv(SPLITK_BLOCK_SIZE // 2, BLOCK_SIZE_K // 2)

        # Create pointers for first block of A and B input matrices
        # The BLOCK sizes are of the elements and in fp4 we pack 2 per uint8 container.
        offs_k = tl.arange(0, BLOCK_SIZE_K // 2)
        offs_k_shuffle_arr = tl.arange(0, (BLOCK_SIZE_K // 2) * 16)
        offs_k_split = pid_k * (SPLITK_BLOCK_SIZE // 2) + offs_k
        offs_k_shuffle = pid_k * (SPLITK_BLOCK_SIZE // 2) * 16 + offs_k_shuffle_arr

        # Row offsets wrap with a modulo, so the loads below need no M/N mask.
        offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
        # NOTE(review): B rows are indexed in units of 16 output columns but
        # wrapped with % N (not N // 16) - confirm this is intended.
        offs_bn = (pid_n * (BLOCK_SIZE_N // 16) + tl.arange(0, BLOCK_SIZE_N // 16)) % N
        a_ptrs = a_ptr + (
            offs_am[:, None] * stride_am + offs_k_split[None, :] * stride_ak
        )
        b_ptrs = b_ptr + (
            offs_bn[:, None] * stride_bn + offs_k_shuffle[None, :] * stride_bk
        )

        # Create pointers for the first block of A and B scales
        offs_asn = (
            pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32))
        ) % N
        offs_ks = (pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE) * 32) + tl.arange(
            0, BLOCK_SIZE_K // SCALE_GROUP_SIZE * 32
        )
        # B scales are N x K even though B operand is K x N.
        b_scale_ptrs = (
            b_scales_ptr
            + offs_asn[:, None] * stride_bsn
            + offs_ks[None, :] * stride_bsk
        )

        if BLOCK_SIZE_M < 32:
            # Small-M tiles read A scales row by row, one scale per 32-wide K group.
            offs_ks_non_shufl = (
                pid_k * (SPLITK_BLOCK_SIZE // SCALE_GROUP_SIZE)
            ) + tl.arange(0, BLOCK_SIZE_K // SCALE_GROUP_SIZE)
            a_scale_ptrs = (
                a_scales_ptr
                + offs_am[:, None] * stride_asm
                + offs_ks_non_shufl[None, :] * stride_ask
            )
        else:
            # Larger tiles address A scales in groups of 32 rows, using the
            # same widened K offsets as the B scales.
            offs_asm = (
                pid_m * (BLOCK_SIZE_M // 32) + tl.arange(0, (BLOCK_SIZE_M // 32))
            ) % M
            a_scale_ptrs = (
                a_scales_ptr
                + offs_asm[:, None] * stride_asm
                + offs_ks[None, :] * stride_ask
            )

        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

        for k in range(pid_k * num_k_iter, (pid_k + 1) * num_k_iter):
            if BLOCK_SIZE_M < 32:
                a_scales = tl.load(a_scale_ptrs)
            else:
                # Rearrange the loaded tile into
                # (BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE).
                a_scales = (
                    tl.load(a_scale_ptrs)
                    .reshape(
                        BLOCK_SIZE_M // 32,
                        BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8,
                        4,
                        16,
                        2,
                        2,
                        1,
                    )
                    .permute(0, 5, 3, 1, 4, 2, 6)
                    .reshape(BLOCK_SIZE_M, BLOCK_SIZE_K // SCALE_GROUP_SIZE)
                )

            # Same rearrangement for the B scales, giving
            # (BLOCK_SIZE_N, BLOCK_SIZE_K // SCALE_GROUP_SIZE).
            b_scales = (
                tl.load(b_scale_ptrs, cache_modifier=cache_modifier)
                .reshape(
                    BLOCK_SIZE_N // 32,
                    BLOCK_SIZE_K // SCALE_GROUP_SIZE // 8,
                    4,
                    16,
                    2,
                    2,
                    1,
                )
                .permute(0, 5, 3, 1, 4, 2, 6)
                .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // SCALE_GROUP_SIZE)
            )

            # Load the next block of A and B, generate a mask by checking the K dimension.
            # If it is out of bounds, set it to 0.
            # NOTE(review): only the EVEN_K path is implemented here; there is
            # no masked load, so `a` and `b` are undefined when EVEN_K is false.
            if EVEN_K:
                a = tl.load(a_ptrs)
                b = tl.load(b_ptrs, cache_modifier=cache_modifier)

            # Rearrange the B tile to (BLOCK_SIZE_K // 2, BLOCK_SIZE_N) for the dot.
            b = (
                b.reshape(
                    1,
                    BLOCK_SIZE_N // 16,
                    BLOCK_SIZE_K // 64,
                    2,
                    16,
                    16,
                )
                .permute(0, 1, 4, 2, 3, 5)
                .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K // 2)
                .trans(1, 0)
            )

            accumulator += tl.dot_scaled(a, a_scales, "e2m1", b, b_scales, "e2m1")

            # Advance the ptrs to the next K block.
            a_ptrs += (BLOCK_SIZE_K // 2) * stride_ak
            b_ptrs += (BLOCK_SIZE_K // 2) * 16 * stride_bk
            if BLOCK_SIZE_M < 32:
                a_scale_ptrs += (BLOCK_SIZE_K // SCALE_GROUP_SIZE) * stride_ask
            else:
                a_scale_ptrs += BLOCK_SIZE_K * stride_ask
            b_scale_ptrs += BLOCK_SIZE_K * stride_bsk

        # Write back the block of the output matrix C with masks.
        offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
        c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

        # The multiply-add epilogue is fused only when there is no split-K;
        # otherwise the partial sums are stored as they are.
        if NUM_KSPLIT == 1:
            if IS_A_SCALAR and IS_A_TENSOR:
                c_a = tl.load(c_a_ptr)
            elif IS_A_SCALAR:
                c_a = c_a_ptr
            else:
                c_a = tl.load(
                    c_a_ptr
                    + stride_cam * offs_cm[:, None]
                    + stride_can * offs_cn[None, :],
                    mask=c_mask,
                )
            c_a = c_a.to(tl.float32)

            if IS_B_SCALAR and IS_B_TENSOR:
                c_b = tl.load(c_b_ptr)
            elif IS_B_SCALAR:
                c_b = c_b_ptr
            else:
                c_b = tl.load(
                    c_b_ptr
                    + stride_cbm * offs_cm[:, None]
                    + stride_cbn * offs_cn[None, :],
                    mask=c_mask,
                )
            c_b = c_b.to(tl.float32)

            if FUSE_TYPE == 0:
                accumulator = c_a * accumulator + c_b
            else:
                accumulator = c_b * c_a + accumulator

        c = accumulator.to(c_ptr.type.element_ty)

        # pid_k * stride_ck selects this split's slice of the output buffer.
        c_ptrs = (
            c_ptr
            + stride_cm * offs_cm[:, None]
            + stride_cn * offs_cn[None, :]
            + pid_k * stride_ck
        )
        tl.store(c_ptrs, c, mask=c_mask, cache_modifier=".wt")
@functools.lru_cache(maxsize=1024)
def _get_config(
    M: int,
    N: int,
    K: int,
    shuffle: bool = False,
):
    """Return the tuned kernel config for an (M, N, K) GEMM problem.

    Config files are read lazily from ``AITER_TRITON_CONFIGS_PATH/gemm`` and
    kept in ``_get_config._config_dict``, keyed by ``"default<suffix>"`` for
    the per-device default file and ``"<N>_<K><suffix>"`` for shape-specific
    files, where ``<suffix>`` is ``"_PRESHUFFLED"`` when ``shuffle`` is true
    and empty otherwise.

    Args:
        M: Number of rows of the activation; selects the size bucket.
        N: Number of output columns; part of the shape-specific file name.
        K: Reduction size as used by the caller; the shape-specific file is
            looked up with ``K={2*K}``.
        shuffle: Select the ``_PRESHUFFLED`` family of config files.

    Returns:
        The config dict of the bucket chosen by ``M`` (``small``,
        ``small_M16``, ``medium_M32/64/128``, ``large``, ``xlarge`` or
        ``xlarge_M<pow2>``). The returned dict is the cached object itself,
        so callers must not mutate it.

    Raises:
        OSError: If the default config file cannot be opened.
        KeyError: If the selected file has no entry for the chosen bucket.
    """
    suffix = "" if not shuffle else "_PRESHUFFLED"
    default_key = f"default{suffix}"

    # Create the registry exactly once. The previous code probed the dict
    # with hasattr(), which looks for an *attribute* and is therefore always
    # False; it then reset the registry and re-read the default file on every
    # cache miss, throwing away shape-specific entries and the other shuffle
    # variant.
    if not hasattr(_get_config, "_config_dict"):
        _get_config._config_dict = {}
    configs = _get_config._config_dict

    if default_key not in configs:
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-AFP4WFP4{suffix}.json"
        with open(fpath, "r") as file:
            configs[default_key] = json.load(file)

    key = f"{N}_{K}{suffix}"
    if key not in configs:
        dev = arch_info.get_device()
        fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-AFP4WFP4{suffix}-N={N}-K={2*K}.json"
        if os.path.exists(fpath):
            with open(fpath, "r") as file:
                configs[key] = json.load(file)
        else:
            key = default_key  # fall back to default config

    shape_configs = configs[key]

    if M < 32:
        BLK_M = triton.next_power_of_2(M)
        if BLK_M >= 16 and "small_M16" in shape_configs:
            return shape_configs["small_M16"]
        return shape_configs["small"]
    elif M <= 128:
        # 32 <= M <= 128 rounds up to exactly 32, 64 or 128.
        BLK_M = triton.next_power_of_2(M)
        return shape_configs[f"medium_M{BLK_M}"]
    elif M <= 256:
        return shape_configs["large"]
    else:
        BLK_M = triton.next_power_of_2(M)
        if f"xlarge_M{BLK_M}" in shape_configs:
            return shape_configs[f"xlarge_M{BLK_M}"]
        return shape_configs["xlarge"]
# EVEN_M_N is derived per launch: true when M is a multiple of BLOCK_SIZE_M1
# and N1 is a multiple of the columns one program covers
# (BLOCK_SIZE_N1 * NUM_ITER). It selects the unmasked load/store paths.
@triton.heuristics(
    {
        "EVEN_M_N": lambda args: args["M"] % args["BLOCK_SIZE_M1"] == 0
        and args["N1"] % (args["BLOCK_SIZE_N1"] * args["NUM_ITER"]) == 0,
    }
)
@triton.jit
def _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel(
    x_ptr,
    y_ptr,
    y_scale_ptr,
    x2_ptr,
    y2_ptr,
    stride_x_spk,
    stride_x_m,
    stride_x_n,
    stride_y_m,
    stride_y_n,
    stride_y_scale_m,
    stride_y_scale_n,
    stride_x2_spk,
    stride_x2_m,
    stride_x2_n,
    stride_y2_m,
    stride_y2_n,
    M,
    N1,
    N2,
    BLOCK_SIZE_M1: tl.constexpr,
    BLOCK_SIZE_N1: tl.constexpr,
    BLOCK_SIZE_M2: tl.constexpr,
    BLOCK_SIZE_N2: tl.constexpr,
    NUM_ITER: tl.constexpr,
    NUM_STAGES: tl.constexpr,
    MXFP4_QUANT_BLOCK_SIZE: tl.constexpr,
    EVEN_M_N: tl.constexpr,
    SCALING_MODE: tl.constexpr,
    ACTIVATION: tl.constexpr,
    scaleN: tl.constexpr,
    scaleM_pad: tl.constexpr,
    scaleN_pad: tl.constexpr,
    SHUFFLE: tl.constexpr,
    X_HAS_SPLITK: tl.constexpr,
    X_NUM_KSPLIT: tl.constexpr,
    X_NUM_KSPLIT_POW2: tl.constexpr,
):
    """
    Fused (optional split-K reduce) + activation * multiply + MXFP4 quantization.

    The 1-D grid is split into two ranges of program ids:

    * ids below cdiv(M, BLOCK_SIZE_M1) * cdiv(N1, BLOCK_SIZE_N1 * NUM_ITER)
      process `x`: each loads one tile at column offset n and a second tile
      N1 elements further along the n axis, sums both over the split-K axis
      when X_HAS_SPLITK, computes ACTIVATION(first) * second, quantizes the
      result with _mxfp4_quant_op, and stores the packed values to `y`
      (BLOCK_SIZE_N1 // 2 columns per tile) and the block scales to `y_scale`.
    * remaining ids (only when X_HAS_SPLITK) sum `x2` over its split-K axis
      and store the result to `y2` in y2's element type, then return.

    When SHUFFLE is set the block scales are written at a permuted offset
    computed from the row/column indices, and scale entries outside
    (M, scaleN) are replaced by 127 before the store.
    NOTE(review): presumably this is the pre-shuffled scale layout expected
    by the preshuffle GEMM kernels and 127 encodes an e8m0 scale of 1.0 -
    confirm against the consumer.

    SCALING_MODE is accepted but not read in this body.
    """

    tl.assume(stride_x_spk > 0)
    tl.assume(stride_x_m > 0)
    tl.assume(stride_x_n > 0)
    tl.assume(stride_y_m > 0)
    tl.assume(stride_y_n > 0)
    tl.assume(stride_y_scale_m > 0)
    tl.assume(stride_y_scale_n > 0)
    tl.assume(stride_x2_spk > 0)
    tl.assume(stride_x2_m > 0)
    tl.assume(stride_x2_n > 0)
    tl.assume(stride_y2_m > 0)
    tl.assume(stride_y2_n > 0)

    all_pid = tl.program_id(axis=0)
    num_pid_m1 = tl.cdiv(M, BLOCK_SIZE_M1)
    num_pid_n1 = tl.cdiv(N1, BLOCK_SIZE_N1 * NUM_ITER)
    num_pid_1 = num_pid_m1 * num_pid_n1

    # Program ids past the quantization range handle the plain split-K
    # reduction of the second tensor.
    if X_HAS_SPLITK and all_pid >= num_pid_1:
        pid2 = all_pid - num_pid_1
        num_pid_n2 = tl.cdiv(N2, BLOCK_SIZE_N2)
        pid_m2 = pid2 // num_pid_n2
        pid_n2 = pid2 % num_pid_n2
        # Offsets wrap with a modulo, so the load and store below are
        # unmasked along m and n.
        offs_m2 = (pid_m2 * BLOCK_SIZE_M2 + tl.arange(0, BLOCK_SIZE_M2)) % M
        offs_n2 = (pid_n2 * BLOCK_SIZE_N2 + tl.arange(0, BLOCK_SIZE_N2)) % N2
        offs_spk = tl.arange(0, X_NUM_KSPLIT_POW2)
        x2_ptrs = (
            x2_ptr
            + offs_spk[:, None, None] * stride_x2_spk
            + offs_m2[None, :, None] * stride_x2_m
            + offs_n2[None, None, :] * stride_x2_n
        )
        if X_NUM_KSPLIT_POW2 == X_NUM_KSPLIT:
            x2 = tl.load(x2_ptrs)
        else:
            # Padding splits (index >= X_NUM_KSPLIT) contribute zero to the sum.
            x2 = tl.load(
                x2_ptrs, mask=offs_spk[:, None, None] < X_NUM_KSPLIT, other=0.0
            )
        x2 = tl.sum(x2, axis=0)

        x2 = x2.to(y2_ptr.type.element_ty)

        y2_out_ptrs = (
            y2_ptr + (offs_m2[:, None] * stride_y2_m) + (offs_n2[None, :] * stride_y2_n)
        )

        tl.store(y2_out_ptrs, x2)
        return

    pid_m = all_pid // num_pid_n1
    start_n = all_pid % num_pid_n1 * NUM_ITER
    NUM_QUANT_BLOCKS: tl.constexpr = BLOCK_SIZE_N1 // MXFP4_QUANT_BLOCK_SIZE

    offs_spk = None
    if X_HAS_SPLITK:
        offs_spk = tl.arange(0, X_NUM_KSPLIT_POW2)

    # Each program walks up to NUM_ITER consecutive column tiles.
    # NOTE(review): the upper bound clamps a tile index with N1 (an element
    # count), not with the number of column tiles - confirm this is intended;
    # out-of-range tiles rely on the masks below.
    for pid_n in tl.range(start_n, min(start_n + NUM_ITER, N1), num_stages=NUM_STAGES):
        x_offs_m = pid_m * BLOCK_SIZE_M1 + tl.arange(0, BLOCK_SIZE_M1)
        x_offs_n = pid_n * BLOCK_SIZE_N1 + tl.arange(0, BLOCK_SIZE_N1)

        # mask/other stay None when every access is known to be in bounds.
        mask = None
        other = None
        if X_HAS_SPLITK:
            x_ptrs = (
                x_ptr
                + offs_spk[:, None, None] * stride_x_spk
                + x_offs_m[None, :, None] * stride_x_m
                + x_offs_n[None, None, :] * stride_x_n
            )
            if X_NUM_KSPLIT_POW2 != X_NUM_KSPLIT and not EVEN_M_N:
                mask = (
                    (offs_spk[:, None, None] < X_NUM_KSPLIT)
                    & (x_offs_m[None, :, None] < M)
                    & (x_offs_n[None, None, :] < N1)
                )
                other = 0.0
            elif not (X_NUM_KSPLIT_POW2 == X_NUM_KSPLIT):
                mask = offs_spk[:, None, None] < X_NUM_KSPLIT
                other = 0.0
            elif not EVEN_M_N:
                mask = (x_offs_m[None, :, None] < M) & (x_offs_n[None, None, :] < N1)
                other = 0.0
        else:
            x_ptrs = (
                x_ptr + x_offs_m[:, None] * stride_x_m + x_offs_n[None, :] * stride_x_n
            )
            if not EVEN_M_N:
                mask = (x_offs_m[:, None] < M) & (x_offs_n[None, :] < N1)
                other = 0.0

        x = tl.load(
            x_ptrs,
            mask=mask,
            other=other,
            cache_modifier=".cg",
        ).to(tl.float32)
        # The multiplier tile sits N1 elements further along the n axis of
        # the same rows.
        x_mul = tl.load(
            x_ptrs + N1 * stride_x_n,
            mask=mask,
            other=other,
            cache_modifier=".cg",
        ).to(tl.float32)

        if X_HAS_SPLITK:
            # Collapse the split-K partial sums before applying the activation.
            x = tl.sum(x, axis=0)
            x_mul = tl.sum(x_mul, axis=0)

        # x = _apply_activation_from_str(a, ACTIVATION) * b
        x = ACTIVATION(x) * x_mul

        y, y_scale = _mxfp4_quant_op(
            x, BLOCK_SIZE_N1, BLOCK_SIZE_M1, MXFP4_QUANT_BLOCK_SIZE
        )

        out_offs_m = pid_m * BLOCK_SIZE_M1 + tl.arange(0, BLOCK_SIZE_M1)
        # out_offs_m = x_offs_m
        # The quantized output is half as wide as the input tile.
        out_offs_n = pid_n * BLOCK_SIZE_N1 // 2 + tl.arange(0, BLOCK_SIZE_N1 // 2)
        out_offs = out_offs_m[:, None] * stride_y_m + out_offs_n[None, :] * stride_y_n

        if EVEN_M_N:
            tl.store(y_ptr + out_offs, y)
        else:
            out_mask = (out_offs_m < M)[:, None] & (out_offs_n < (N1 // 2))[None, :]
            tl.store(y_ptr + out_offs, y, mask=out_mask)

        bs_offs_m = pid_m * BLOCK_SIZE_M1 + tl.arange(0, BLOCK_SIZE_M1)
        # bs_offs_m = x_offs_m
        bs_offs_n = pid_n * NUM_QUANT_BLOCKS + tl.arange(0, NUM_QUANT_BLOCKS)
        if SHUFFLE:
            # Split the row index into (32-row group, half of the group,
            # row within the 16-row half) and the column index into
            # (group of 8, half of the group, column within the half), then
            # recombine them into the permuted flat offset.
            bs_offs_0 = bs_offs_m[:, None] // 32
            bs_offs_1 = bs_offs_m[:, None] % 32
            bs_offs_2 = bs_offs_1 % 16
            bs_offs_1 = bs_offs_1 // 16
            bs_offs_3 = bs_offs_n[None, :] // 8
            bs_offs_4 = bs_offs_n[None, :] % 8
            bs_offs_5 = bs_offs_4 % 4
            bs_offs_4 = bs_offs_4 // 4
            bs_offs = (
                bs_offs_1
                + bs_offs_4 * 2
                + bs_offs_2 * 2 * 2
                + bs_offs_5 * 2 * 2 * 16
                + bs_offs_3 * 2 * 2 * 16 * 4
                + bs_offs_0 * 2 * 16 * scaleN
            )
            # bs_mask1 marks real scale entries; bs_mask also admits the
            # padded region, which is filled with 127 below.
            bs_mask1 = (bs_offs_m < M)[:, None] & (bs_offs_n < scaleN)[None, :]
            bs_mask = (bs_offs_m < scaleM_pad)[:, None] & (bs_offs_n < scaleN_pad)[
                None, :
            ]
            y_scale = tl.where(bs_mask1, y_scale, 127)
        else:
            bs_offs = (
                bs_offs_m[:, None] * stride_y_scale_m
                + bs_offs_n[None, :] * stride_y_scale_n
            )
            bs_mask = (bs_offs_m < M)[:, None] & (bs_offs_n < scaleN)[None, :]
        if EVEN_M_N:
            tl.store(y_scale_ptr + bs_offs, y_scale)
        else:
            tl.store(
                y_scale_ptr + bs_offs,
                y_scale,
                mask=bs_mask,
            )
SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, + num_warps: tl.constexpr, + num_stages: tl.constexpr, + waves_per_eu: tl.constexpr, + matrix_instr_nonkdim: tl.constexpr, GRID_MN: tl.constexpr, + ATOMIC_ADD: tl.constexpr, cache_modifier: tl.constexpr, ): - """ - Kernel for computing the matmul C = A x B. + """Kernel for computing the matmul C = A x B. A and B inputs are in the microscale fp4 (mxfp4) format. A_scales and B_scales are in e8m0 format. A has shape (M, K), B has shape (K, N) and C has shape (M, N) @@ -95,8 +100,6 @@ def _gemm_afp4_wfp4_pre_quant_kernel( num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) if NUM_KSPLIT == 1: - remap_xcd(pid, GRID_MN) - pid_m, pid_n = pid_grid(pid, num_pid_m, num_pid_n, GROUP_SIZE_M=GROUP_SIZE_M) else: pid_m = pid // num_pid_n @@ -104,6 +107,8 @@ def _gemm_afp4_wfp4_pre_quant_kernel( tl.assume(pid_m >= 0) tl.assume(pid_n >= 0) + tl.assume(pid_k >= 0) + # We assume 32 elements along K share the same scale. SCALE_GROUP_SIZE: tl.constexpr = 32 @@ -148,10 +153,15 @@ def _gemm_afp4_wfp4_pre_quant_kernel( b = tl.load(b_ptrs, cache_modifier=cache_modifier) else: a_bf16 = tl.load( - a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0 + a_ptrs, + mask=offs_k_bf16[None, :] < 2 * K - k * BLOCK_SIZE_K, + other=0, ) b = tl.load( - b_ptrs, mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), other=0 + b_ptrs, + mask=offs_k[:, None] < K - k * (BLOCK_SIZE_K // 2), + other=0, + cache_modifier=cache_modifier, ) a, a_scales = _mxfp4_quant_op(a_bf16, BLOCK_SIZE_K, BLOCK_SIZE_M, 32) @@ -175,7 +185,29 @@ def _gemm_afp4_wfp4_pre_quant_kernel( + pid_k * stride_ck ) c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.atomic_add(c_ptrs, c, mask=c_mask, sem="relaxed") + # if pid == 0: + # tl.device_print("c", c) + if ATOMIC_ADD: + tl.atomic_add(c_ptrs, c, mask=c_mask, sem="relaxed") + else: + tl.store(c_ptrs, c, mask=c_mask) + + +_gemm_a16wfp4_preshuffle_repr = make_kernel_repr( + "_gemm_a16wfp4_preshuffle_kernel", + [ + "BLOCK_SIZE_M", + 
"BLOCK_SIZE_N", + "BLOCK_SIZE_K", + "GROUP_SIZE_M", + "num_warps", + "num_stages", + "waves_per_eu", + "matrix_instr_nonkdim", + "cache_modifier", + "NUM_KSPLIT", + ], +) def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int): @@ -220,7 +252,7 @@ def _get_config( if not hasattr(_get_config, "_config_dict"): dev = arch_info.get_device() _get_config._config_dict = {} - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM_PREQUANT-AFP4WFP4.json" + fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A16WFP4.json" with open(fpath, "r") as file: config = json.load(file) _get_config._config_dict["default"] = config @@ -228,7 +260,9 @@ def _get_config( key = f"{N}_{K}" if key not in _get_config._config_dict.keys(): dev = arch_info.get_device() - fpath = f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM_PREQUANT-AFP4WFP4-N={N}-K={2*K}.json" + fpath = ( + f"{AITER_TRITON_CONFIGS_PATH}/gemm/{dev}-GEMM-A16WFP4-N={N}-K={2*K}.json" + ) if os.path.exists(fpath): with open(fpath, "r") as file: config = json.load(file) @@ -236,8 +270,10 @@ def _get_config( else: key = "default" # fall back to default config - if M < 32: + if M < 16: config = _get_config._config_dict[key]["small"] + elif M < 32: + config = _get_config._config_dict[key]["small_M16"] elif M <= 128: BLK_M = triton.next_power_of_2(M) if BLK_M == 32: @@ -253,19 +289,4 @@ def _get_config( config = config.copy() - if config["NUM_KSPLIT"] > 1: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - - if config["BLOCK_SIZE_K"] >= 2 * K: - config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) - config["SPLITK_BLOCK_SIZE"] = 2 * K - return config diff --git a/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py b/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py index 514f00cab6..cdbbbadb8a 
100644 --- a/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py +++ b/aiter/ops/triton/_triton_kernels/gemm_afp4wfp4.py @@ -13,57 +13,18 @@ _gemm_afp4wfp4_repr = make_kernel_repr( - "_gemm_afp4_wfp4_kernel", + "_gemm_afp4wfp4_kernel", [ "BLOCK_SIZE_M", "BLOCK_SIZE_N", "BLOCK_SIZE_K", "GROUP_SIZE_M", - "NUM_KSPLIT", - "SPLITK_BLOCK_SIZE", - "EVEN_K", + "num_warps", + "num_stages", + "waves_per_eu", + "matrix_instr_nonkdim", "cache_modifier", - ], -) - - -_gemm_afp4wfp4_preshuffled_repr = make_kernel_repr( - "_gemm_afp4_wfp4_kernel_preshuffled_scales", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", "NUM_KSPLIT", - "SPLITK_BLOCK_SIZE", - "EVEN_K", - "cache_modifier", - ], -) - - -_gemm_afp4wfp4_preshuffled_weight_scales_repr = make_kernel_repr( - "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "BLOCK_SIZE_K", - "GROUP_SIZE_M", - "NUM_KSPLIT", - "SPLITK_BLOCK_SIZE", - "EVEN_K", - "cache_modifier", - ], -) - - -_gemm_afp4wfp4_reduce_repr = make_kernel_repr( - "_gemm_afp4_wfp4_reduce_kernel", - [ - "BLOCK_SIZE_M", - "BLOCK_SIZE_N", - "ACTUAL_KSPLIT", - "MAX_KSPLIT", ], ) @@ -76,7 +37,7 @@ } ) @triton.jit(repr=_gemm_afp4wfp4_repr) -def _gemm_afp4_wfp4_kernel( +def _gemm_afp4wfp4_kernel( a_ptr, b_ptr, c_ptr, @@ -104,6 +65,10 @@ def _gemm_afp4_wfp4_kernel( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, + num_warps: tl.constexpr, + num_stages: tl.constexpr, + waves_per_eu: tl.constexpr, + matrix_instr_nonkdim: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -224,6 +189,23 @@ def _gemm_afp4_wfp4_kernel( tl.store(c_ptrs, c, mask=c_mask) +_gemm_afp4wfp4_preshuffle_scales_repr = make_kernel_repr( + "_gemm_afp4wfp4_preshuffle_kernel", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "BLOCK_SIZE_K", + "GROUP_SIZE_M", + "num_warps", + "num_stages", + "waves_per_eu", + "matrix_instr_nonkdim", + "cache_modifier", + "NUM_KSPLIT", + ], +) + + @triton.heuristics( { "EVEN_K": lambda 
args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) @@ -231,8 +213,8 @@ def _gemm_afp4_wfp4_kernel( and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), } ) -@triton.jit(repr=_gemm_afp4wfp4_preshuffled_repr) -def _gemm_afp4_wfp4_kernel_preshuffled_scales( +@triton.jit(repr=_gemm_afp4wfp4_preshuffle_scales_repr) +def _gemm_afp4wfp4_kernel_preshuffle_scales( a_ptr, b_ptr, c_ptr, @@ -260,6 +242,10 @@ def _gemm_afp4_wfp4_kernel_preshuffled_scales( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, + num_warps: tl.constexpr, + num_stages: tl.constexpr, + waves_per_eu: tl.constexpr, + matrix_instr_nonkdim: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -429,6 +415,23 @@ def _gemm_afp4_wfp4_kernel_preshuffled_scales( tl.store(c_ptrs, c, mask=c_mask, cache_modifier=".wt") +_gemm_afp4wfp4_preshuffle_repr = make_kernel_repr( + "_gemm_afp4wfp4_preshuffle_kernel", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "BLOCK_SIZE_K", + "GROUP_SIZE_M", + "num_warps", + "num_stages", + "waves_per_eu", + "matrix_instr_nonkdim", + "cache_modifier", + "NUM_KSPLIT", + ], +) + + @triton.heuristics( { "EVEN_K": lambda args: (args["K"] % (args["BLOCK_SIZE_K"] // 2) == 0) @@ -436,8 +439,8 @@ def _gemm_afp4_wfp4_kernel_preshuffled_scales( and (args["K"] % (args["SPLITK_BLOCK_SIZE"] // 2) == 0), } ) -@triton.jit(repr=_gemm_afp4wfp4_preshuffled_weight_scales_repr) -def _gemm_afp4_wfp4_kernel_preshuffled_weight_scales( +@triton.jit(repr=_gemm_afp4wfp4_preshuffle_repr) +def _gemm_afp4wfp4_preshuffle_kernel( a_ptr, b_ptr, c_ptr, @@ -465,6 +468,10 @@ def _gemm_afp4_wfp4_kernel_preshuffled_weight_scales( NUM_KSPLIT: tl.constexpr, SPLITK_BLOCK_SIZE: tl.constexpr, EVEN_K: tl.constexpr, + num_warps: tl.constexpr, + num_stages: tl.constexpr, + waves_per_eu: tl.constexpr, + matrix_instr_nonkdim: tl.constexpr, cache_modifier: tl.constexpr, ): """ @@ -525,12 +532,10 @@ def _gemm_afp4_wfp4_kernel_preshuffled_weight_scales( offs_am[:, None] * stride_am + offs_k_split[None, 
:] * stride_ak ) b_ptrs = b_ptr + ( - # offs_k_split[:, None] * stride_bk + offs_bn[None, :] * stride_bn - offs_bn[:, None] * stride_bn - + offs_k_shuffle[None, :] * stride_bk + offs_bn[:, None] * stride_bn + offs_k_shuffle[None, :] * stride_bk ) - # Create pointers for the first block of A and B scales + # Create pointers for the first block of A and B scales offs_asn = ( pid_n * (BLOCK_SIZE_N // 32) + tl.arange(0, (BLOCK_SIZE_N // 32)) ) % N @@ -645,8 +650,20 @@ def _gemm_afp4_wfp4_kernel_preshuffled_weight_scales( tl.store(c_ptrs, c, mask=c_mask, cache_modifier=".wt") +_gemm_afp4wfp4_reduce_repr = make_kernel_repr( + "_gemm_afp4wfp4_reduce_kernel", + [ + "BLOCK_SIZE_M", + "BLOCK_SIZE_N", + "ACTUAL_KSPLIT", + "MAX_KSPLIT", + ], +) + + +@triton.heuristics({}) # dummy heuristics to invoke kernel re-naming @triton.jit(repr=_gemm_afp4wfp4_reduce_repr) -def _gemm_afp4_wfp4_reduce_kernel( +def _gemm_afp4wfp4_reduce_kernel( c_in_ptr, c_out_ptr, M, diff --git a/aiter/ops/triton/batched_gemm_a16wfp4.py b/aiter/ops/triton/batched_gemm_a16wfp4.py new file mode 100755 index 0000000000..a10cc66bea --- /dev/null +++ b/aiter/ops/triton/batched_gemm_a16wfp4.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
from typing import Optional
import torch
import triton
import aiter.ops.triton.utils._triton.arch_info as arch_info
from aiter.ops.triton._triton_kernels.batched_gemm_a16wfp4 import (
    _batched_gemm_a16wfp4_reduce_kernel,
    _batched_gemm_a16wfp4_kernel,
    _get_config,
)
from aiter.ops.triton.utils.logger import AiterTritonLogger
from aiter.ops.triton.gemm_a16wfp4 import (
    get_splitk,
)

_LOGGER = AiterTritonLogger()

# When True, split-K partial results are kept in the output dtype instead of
# float32 (see the y_pp allocation in batched_gemm_a16wfp4).
_USE_GEMM_SPLITK_BF16 = False


def set_use_gemm_splitk_bf16(value: bool):
    """Choose whether split-K partials use the output dtype (True) or float32."""
    global _USE_GEMM_SPLITK_BF16
    _USE_GEMM_SPLITK_BF16 = value


def batched_gemm_a16wfp4(
    x,
    w,
    w_scales,
    dtype: Optional[torch.dtype] = torch.bfloat16,
    y: Optional[torch.Tensor] = None,
    config: Optional[dict] = None,
    transpose_bm: Optional[bool] = False,
    prequant: Optional[bool] = True,
    y_scale: Optional[torch.Tensor] = None,
):
    """
    Computes batched FP4 matrix multiplication Y[i] = X[i] @ W[i]^T with active activation quantization.
    X is quantized to MXFP4 during computation, W is pre-quantized FP4.

    Args:
        x (torch.Tensor): Higher precision input batch with shape (B, M, K) (BF16 or FP16).
            Quantized to MXFP4 on-the-fly during GEMM.
        w (torch.Tensor): FP4 E2M1 weight batch with shape (B, N, K), internally transposed.
            NOTE(review): the K used for tiling is taken from ``w.shape[2]`` and
            doubled (``2 * K``), so ``w`` presumably stores two fp4 values per
            element - confirm against the caller.
        w_scales (torch.Tensor): E8M0 per-group scale for w with shape (B, N, K//32).
            One scale per 32 elements in K dimension.
        dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). Only used
            when ``y`` is not supplied.
        y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (B, M, N),
            or (M, B, N) when ``transpose_bm`` is set.
        config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N,
            BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). The dict is
            copied before the split-K fields are adjusted, so the caller's dict
            (or the cached default) is never modified.
        transpose_bm (Optional[bool]): Transpose batch and M dimensions in output.
        prequant (Optional[bool]): Must be True; forwarded to the kernel as PRE_QUANT.
        y_scale (Optional[torch.Tensor]): Optional output scale forwarded to the
            kernel; HAVE_Y_SCALE is set when it is not None.

    Returns:
        torch.Tensor: Output batch with shape (B, M, N), or (M, B, N) when
        ``transpose_bm`` is set.
    """
    _LOGGER.info(
        f"BATCHED_GEMM_AFP4WFP_PREQUANT: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w_scales.shape)}"
    )

    assert prequant is True, "prequant = False is not yet supported"

    assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device"

    Bx, M, K = x.shape
    # K is deliberately re-bound to w's last dimension; all tiling below uses it.
    Bw, N, K = w.shape
    assert Bx == Bw
    B = Bx

    if config is None:
        config = _get_config(M, N, K)
    # Work on a private copy: the fields written below depend on K, and the
    # dict may be shared (caller-owned or returned from a cache).
    config = dict(config)

    if y is None:
        if transpose_bm:
            y = torch.empty((M, B, N), dtype=dtype, device=x.device)
        else:
            y = torch.empty((B, M, N), dtype=dtype, device=x.device)
    else:
        if transpose_bm:
            assert (
                y.shape[0] == M and y.shape[1] == B and y.shape[2] == N
            ), f"Output dimension error {y.shape} {B} {M} {N}"
        else:
            assert (
                y.shape[0] == B and y.shape[1] == M and y.shape[2] == N
            ), f"Output dimension error {y.shape} {B} {M} {N}"

    if config["NUM_KSPLIT"] > 1:
        SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk(
            K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"]
        )

        config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE
        config["BLOCK_SIZE_K"] = BLOCK_SIZE_K
        config["NUM_KSPLIT"] = NUM_KSPLIT

        # Scratch buffer holding one partial result per K split.
        if _USE_GEMM_SPLITK_BF16:
            y_pp = torch.empty(
                (B, config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device
            )
        else:
            y_pp = torch.empty(
                (B, config["NUM_KSPLIT"], M, N),
                dtype=torch.float32,
                device=y.device,
            )
    else:
        config["SPLITK_BLOCK_SIZE"] = 2 * K
        y_pp = None

    # A K block that covers the whole reduction is shrunk to the next power
    # of two of 2 * K.
    if config["BLOCK_SIZE_K"] >= 2 * K:
        config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K)
        config["SPLITK_BLOCK_SIZE"] = 2 * K

    if config["NUM_KSPLIT"] == 1:
        # Single split: the kernel writes straight into y.
        stride_ck = 0
        stride_cn = y.stride(2)
        if transpose_bm:
            stride_cb = y.stride(1)
            stride_cm = y.stride(0)
        else:
            stride_cb = y.stride(0)
            stride_cm = y.stride(1)
    else:
        stride_cb = y_pp.stride(0)
        stride_ck = y_pp.stride(1)
        stride_cm = y_pp.stride(2)
        stride_cn = y_pp.stride(3)

    grid = lambda META: (  # noqa: E731
        B,
        (
            META["NUM_KSPLIT"]
            * triton.cdiv(M, META["BLOCK_SIZE_M"])
            * triton.cdiv(N, META["BLOCK_SIZE_N"])
        ),
    )
    _batched_gemm_a16wfp4_kernel[grid](
        x,
        w,
        y if config["NUM_KSPLIT"] == 1 else y_pp,
        w_scales,
        y_scale,
        M,
        N,
        K,
        x.stride(0),
        x.stride(1),
        x.stride(2),
        w.stride(0),
        w.stride(1),
        w.stride(2),
        stride_cb,
        stride_ck,
        stride_cm,
        stride_cn,
        w_scales.stride(0),
        w_scales.stride(1),
        w_scales.stride(2),
        PRE_QUANT=prequant,
        HAVE_Y_SCALE=(y_scale is not None),
        **config,
    )

    if config["NUM_KSPLIT"] > 1:
        REDUCE_BLOCK_SIZE_M = 16
        # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails
        # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and
        # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials
        # NOTE(review): the value used for fp32 partials below is 64, not the
        # 16 mentioned above - confirm which one is intended.
        REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64
        ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2))

        grid_reduce = (
            B,
            triton.cdiv(M, REDUCE_BLOCK_SIZE_M),
            triton.cdiv(N, REDUCE_BLOCK_SIZE_N),
        )
        # NOTE(review): the two output strides below are ordered
        # (stride(1), stride(0)) without transpose_bm, whereas the main kernel
        # above uses stride(0) for batch and stride(1) for M in that case.
        # Verify against the reduce kernel's parameter order.
        _batched_gemm_a16wfp4_reduce_kernel[grid_reduce](
            y_pp,
            y,
            M,
            N,
            y_pp.stride(0),
            y_pp.stride(1),
            y_pp.stride(2),
            y_pp.stride(3),
            y.stride(0) if transpose_bm else y.stride(1),
            y.stride(1) if transpose_bm else y.stride(0),
            y.stride(2),
            REDUCE_BLOCK_SIZE_M,
            REDUCE_BLOCK_SIZE_N,
            ACTUAL_KSPLIT,
            config["NUM_KSPLIT"],
        )
    return y
with active activation quantization. - X is quantized to MXFP4 during computation, W is pre-quantized FP4. - - Args: - x (torch.Tensor): Higher precision input batch with shape (B, M, K) (BF16 or FP16). - Quantized to MXFP4 on-the-fly during GEMM. - w (torch.Tensor): FP4 E2M1 weight batch with shape (B, N, K), internally transposed. - w_scales (torch.Tensor): E8M0 per-group scale for w with shape (B, N, K//32). - One scale per 32 elements in K dimension. - dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). - y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (B, M, N). - config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, - BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). - - Returns: - torch.Tensor: Output batch with shape (B, M, N). - """ _LOGGER.info( - f"BATCHED_GEMM_AFP4WFP_PREQUANT: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w.shape)}" - ) - - assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" - - Bx, M, K = x.shape - Bw, N, K = w.shape - By, _, _ = y.shape - assert Bx == Bw == By - Batch = Bx - - if config is None: - config = _get_config(M, N, K) - - if config["NUM_KSPLIT"] > 1: - SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( - K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] - ) - - config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE - config["BLOCK_SIZE_K"] = BLOCK_SIZE_K - config["NUM_KSPLIT"] = NUM_KSPLIT - - if _USE_GEMM_SPLITK_BF16: - y_pp = torch.empty( - (Batch, config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device - ) - else: - y_pp = torch.empty( - (Batch, config["NUM_KSPLIT"], M, N), - dtype=torch.float32, - device=y.device, - ) - else: - config["SPLITK_BLOCK_SIZE"] = 2 * K - y_pp = None - - if config["BLOCK_SIZE_K"] >= 2 * K: - config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) - config["SPLITK_BLOCK_SIZE"] = 2 * K - - grid = lambda META: ( # noqa: E731 - Batch, - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * 
triton.cdiv(N, META["BLOCK_SIZE_N"]) - ), + "batched_gemm_afp4wfp4_pre_quant will be deprecated in future AITER release, please switch to batched_gemm_a16wfp4" ) - _batched_gemm_afp4_wfp4_pre_quant_kernel[grid]( - x, - w, - y if config["NUM_KSPLIT"] == 1 else y_pp, - w_scales, - M, - N, - K, - x.stride(0), - x.stride(1), - x.stride(2), - w.stride(0), - w.stride(1), - w.stride(2), - y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), - 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), - y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), - y.stride(2) if config["NUM_KSPLIT"] == 1 else y_pp.stride(3), - w_scales.stride(0), - w_scales.stride(1), - w_scales.stride(2), - **config, + return batched_gemm_a16wfp4( + x, w, w_scales, dtype, y, config, transpose_bm=False, prequant=True ) - - if config["NUM_KSPLIT"] > 1: - REDUCE_BLOCK_SIZE_M = 16 - # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails - # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and - # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials - REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 - ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) - - grid_reduce = ( - Batch, - triton.cdiv(M, REDUCE_BLOCK_SIZE_M), - triton.cdiv(N, REDUCE_BLOCK_SIZE_N), - ) - _batched_gemm_afp4_wfp4_pre_quant_reduce_kernel[grid_reduce]( - y_pp, - y, - M, - N, - y_pp.stride(0), - y_pp.stride(1), - y_pp.stride(2), - y_pp.stride(3), - y.stride(0), - y.stride(1), - y.stride(2), - REDUCE_BLOCK_SIZE_M, - REDUCE_BLOCK_SIZE_N, - ACTUAL_KSPLIT, - config["NUM_KSPLIT"], - ) - return y diff --git a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json new file mode 100644 index 0000000000..2778eb61a2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16-N4=512-N16=256-K=7168.json @@ -0,0 +1,86 
@@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "small_M16": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 8 + }, + "medium_M32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 6, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 7 + }, + "medium_M64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 7 + }, + "medium_M128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "large": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 4 + }, + "xlarge": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16.json b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16.json new file mode 100644 index 0000000000..21d51bd9a8 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4-A16W16.json @@ -0,0 
+1,14 @@ +{ + "any": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json new file mode 100644 index 0000000000..c356742b24 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-FUSED-GEMM-AFP4WFP4_PRESHUFFLED-A16W16.json @@ -0,0 +1,38 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "any": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + } +} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json index 94947a5a1a..848e1c288f 100644 --- a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16W16-N=256-K=7168.json @@ -76,5 +76,18 @@ "cache_modifier": null, "NUM_KSPLIT": 1, "kpack": 1 + }, + "any": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 32, + "cache_modifier": null, + 
"NUM_KSPLIT": 1, + "kpack": 1 } } diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=512-K=7168.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=512-K=7168.json new file mode 100644 index 0000000000..f2a37990bc --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=512-K=7168.json @@ -0,0 +1,75 @@ +{ + "small": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "medium_M32": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "medium_M64": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "medium_M128": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "large": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + }, + "xlarge": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 14 + } + +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=7168-K=2048.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=7168-K=2048.json new file mode 100644 index 
0000000000..a7d0e5319e --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4-N=7168-K=2048.json @@ -0,0 +1,86 @@ +{ + "small": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "small_M16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 4 + }, + "medium_M64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } +} diff --git a/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4.json b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4.json new file mode 100644 index 
0000000000..87585de25d --- /dev/null +++ b/aiter/ops/triton/configs/gemm/MI350X-GEMM-A16WFP4.json @@ -0,0 +1,87 @@ +{ + "small": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "small_M16": { + "BLOCK_SIZE_M": 4, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M32": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M64": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "medium_M128": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "large": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": ".cg", + "NUM_KSPLIT": 1 + }, + "xlarge": { + "BLOCK_SIZE_M": 8, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 512, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 1, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "cache_modifier": null, + "NUM_KSPLIT": 1 + } + +} diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a92d1d94da..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 3170e87c52..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 339d8b60b3..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 0a9d32b9b3..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index fd439c98f0..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 1c6d43bf8b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index fb4cfcdbb1..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index f4f2b5f242..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a07265a92c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 164a2daa55..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 11a81135b7..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index caf2d9663c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 0f16aad417..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d216f4db45..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4b472f387c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 1a354efcee..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4d9bd9f32e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index effc769ed2..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 3a94d7f3f9..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 5c9fa23455..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 2b7333aab5..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 9f9940f169..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index b2a2cfed56..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 07d026c9bc..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 517dd3009b..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 75c2c0f392..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d96d7107d0..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a69de9121f..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 786585e8e6..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 3f9b8bc0f8..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 161cc7f778..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4d9bd9f32e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 2a8f53e1ec..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d3f3b6944e..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d854a26d44..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 18518c7114..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", 
"bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index ddfa86a175..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 630d0cf7be..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": 
false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 878752918d..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index db028776cd..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 5f22b90628..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e1bb487e35..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 43952c536a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 9e5e0d0b33..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e06ba3b5e8..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index dfcc4c6fe5..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", 
"supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 0703f1c3c5..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index bf18531344..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 97fc49e9ef..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 7180f472b1..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, 
"profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4474a4cbba..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index c02d8df2eb..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, 
"matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 625ebad0aa..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 6720a67345..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 97be1f6400..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 9bde266a37..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 59feb2de7f..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e1d87be86c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a51b2294a3..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a70e0a6ed4..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 0214033b0d..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 6da53f98e2..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 7de88c2416..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 839ba3f892..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e58dac5e89..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 5a5927322e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 677161f2e2..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index ab66b9aac0..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 06164a3817..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 2a8f53e1ec..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d3f3b6944e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 94f8f302d4..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d854a26d44..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 18518c7114..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index ddfa86a175..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 630d0cf7be..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": 
"hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 878752918d..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index db028776cd..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 5f22b90628..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index aa851bad5e..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e1bb487e35..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 43952c536a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 65d5835012..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", 
"supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 9e5e0d0b33..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e06ba3b5e8..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index dfcc4c6fe5..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 0703f1c3c5..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index bf18531344..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 97fc49e9ef..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 7180f472b1..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": 
"hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4474a4cbba..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index c02d8df2eb..0000000000 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 625ebad0aa..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 1cf1324d04..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a582768be3..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e1d87be86c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 4a266b4bdb..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "ce7a874c46162d4a686ded6b749e772fdd69eb8099e1788200e24e28b3b714e6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 7ab0ef739d..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 7b2dc1c3ec..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4484a8d7ff..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 97adaa1c6a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index f7aff5c000..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 89db920df9..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 7bca47bc78..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index a68caf2419..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 129f6593aa..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "b8c2841a693dc9b73f1792c733bccb56ed63fbad8b5b0818edf2ab6ab963dadd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, 
"profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 2b1e91002c..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index b8395db679..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index a88a1a528c..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "e22eb2008573c5f836861cb1efaf8f54828bffbb6f2c6f1e1e1f96374e36c9c7", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, 
"waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 5d30e79cfc..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 2a8f53e1ec..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d3f3b6944e..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 94f8f302d4..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", 
"bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index d854a26d44..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 18518c7114..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "2e39738c94cf8e93300d242f527e5a75558988d05bb786410e6aa7c079c43155", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], 
"debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index ddfa86a175..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 630d0cf7be..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "4d89d1c52ce0647d9996ad94a4956ebbdc392eb7945d4a844cc3eedafb74339c", "target": {"backend": "hip", "arch": 
"gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 878752918d..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index db028776cd..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 5f22b90628..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index aa851bad5e..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", 
"bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e1bb487e35..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 43952c536a..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 65d5835012..0000000000 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 9e5e0d0b33..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index e06ba3b5e8..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index dfcc4c6fe5..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "783e1e926ea3e03a7a188de990e795a2417db40c4b258ff9eb71c41e877bb3c0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": 
"hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 0703f1c3c5..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 441977fcda..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], 
"default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index bf18531344..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 146ca2148f..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 97fc49e9ef..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index 7180f472b1..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": 
"c5cc000d5ddbf5481c376636b1c60b99ab83c90d0670791191880e92214f806a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 4474a4cbba..0000000000 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json deleted file mode 100644 index c02d8df2eb..0000000000 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ /dev/null @@ -1 +0,0 @@ -{"hash": "cc6e7510758c916f36e6cd30a0065cd1230e801e7aa8a0769e3174d5eed3332e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco deleted file mode 100644 index 625ebad0aa..0000000000 Binary files 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and /dev/null differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..fb37bfec76 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b13e41cf45..6c9bbab49f 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 
1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ac0db73ed121f17392580b800e40b9bb87fb9d0369e185933a9886c5f4583ee1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline 
at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..27028e374b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 053718dae0..9a0d6d26b6 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f70c711c78f7418d6182e8cfd2b0d0211ab59b720b83dcbbfd09de4594147fb5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": 
["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "6d85fca9c7dda9dd3d31527b373b3c29db32528e788d6cb9143e0e81d6ae34af", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..93d8bead42 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 1282fab52e..4ca7039532 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "b54b063171df37071e5d216a95968f9b0071bfc3f1dac8a6507d7d3412b3b2c2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": 
true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "9fd3df79022457865bb0cf42f522518c6cd3e7b1ab219fb72a2e44a73264f9c8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 
0000000000..41c4b47847 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b62a824706..1285795a7d 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "950f33955df085252de58736ed2fa6ca548cd5920743ecf8df8ce38db3a27bf1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": 
"hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "8fc8efc57f3e712af9df23ab9931e5fd351bf1e02eea9fdca71a6a66937cf13e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..835e251818 Binary files /dev/null and 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 7235ff4fd4..826c8da5ae 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", 
"schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "f66e735c364132c4a96f7ab74f2c7f24fa8c65d66df35f9fa0e3d69d1cbecb82", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..100b5bb85e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 87d5d11a93..3b1d8498f3 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "70389c6a931f80ad9dabdafe6366f140480dc46d3546e1a47f6e8038dffbbcbe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "c9e983b7947524e0a29c25c4a49650d79c688f11f50e9fe55d8bb698ba080920", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..f49047088e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 6e600e607e..a981e911b6 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "a098a54b9f5edd21bdbd84ba2ae5bc0f6493e3ae9e7fbe11ba4f755d7d33b2c8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "d313ce59ac66783429f1fce91908e814f031eea49a81174adc8e34d0096a2ec5", 
"target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..94243c9c4a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 9d8fbfacef..5c07dd95d4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "576f5ce01ce40e9047a2afe995e3d1a807b4d8ba89ccfde6875bbcdf1bedc771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e456eba3a5b4830cde475cc44e7cef91ff21d809663a7ed0eeb0328af862b63f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..49da0f7265 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json index 65d5835012..309d8ac9e8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "78f393626543655d3ba606bacdb60417112eff9c077d601fd4ae53e4b203e727", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "37d651c067f3f7312122c03fa9c7dbe80fe2afdd2f5fcd3bf83363c2363d2389", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, 
"sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..ac65871215 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json index 4aea58cafe..0e2f5d06a6 100644 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "7d4eee9825f37b996ae066357854ac66af67d8ff733626eb7ee22f12d90c425a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "3fdba0f802a66c444714f4be12e76dc72fc690ff876052acc7cc46e5039ffa7d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", 
"fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..f64316fbed Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json index 3e7d0dbdf7..1ff26942cf 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f2d1a87b6a5690047dc9744b4dcbf4e6ca2d231295a0eb21b4de4d2d57d5c452", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "d0bbf4b796f5cdb38155e4196ee0ae86905a8fb5e602a193f2eff7a5e68d1c06", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": 
false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..8ed7dd1629 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json index 779fd69d94..31b4af3827 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": 
"3f103db1b06fbcc5665da2d706abfc42ab421a1e2136147b1fb729db1aa0c1de", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "652620172805dfbc029f1d71f6020daeee787441d5002a470dde957124663162", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", 
"instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..8ed16da6d9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json index 638a5fd66a..5370206b70 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "674006f4c8ea7904eecc04cb91ea7fd771ffb64c1b070a084ec21c03ffe1f1c2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, 
"waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ff17f59d450c0f6fd17e02dafe9880c20d9f0b50b2ea7671930c5fa093bdb1cb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, 
"profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..04de5251bc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json index 30a971a865..821efe3ad8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "50630419988688add7ab5f7992729c367ac76cd80edb7ed14b1c4f86a6af5938", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "de64f305bd6b13d6f447e75a3f28c36ee5dc346e4732e083f0080fad0c11172e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..b528710129 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json index 24580e0921..389cb90b96 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "af947e9c60407171fac651b5e73064363155003f3949a1280322f28c0bc82174", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cf8a4eb32f6ee1258daeff2f719991d5e93a49a56b3d4e472ce1ce3ac1724886", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..d82c2aeb50 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 56bed44a0c..5ddc74b606 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=1-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "21134c6efa51cd932dd9df5b64c4b4b8baccaa9c75ecae3319de4d9c11304c13", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..3682538491 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index cf1de878a4..b1a3df2bc0 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "1eea6a6f69efd9adbfc722daf6f70fbf96785d4c2536c27089af2750e93f1007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "6fd92a5704cafe8fc8128fcfeb9580a709cc96f90ab4bd77cb30eb36fff0719a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..cbb9bd7d3e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..08acc59e8a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "60c353bc69789c371fe574057f3c38a26d378b47a2aedcd108b6dc4c5488b6d1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..b1b4884cd7 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 2df2522e1c..d3e5b004b4 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "97cdf5b137cd798fc01173124f1fe7f434603233131809f90b9122692b5e0691", "target": 
{"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "50d6ea63d4e28ea753cea3329b6ed6e5c066d3eabd224d8d425de7f4d3c364e2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": 
"3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..30ab95328e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index f32f36bceb..19fa9e3080 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "7a8df9f76c7249c0727020baab6fc5a45bff3f61821a017af0bcdcb31a158d51", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "bad417c8a82cc77f535d3fc28f0dc2a7427c276cb1dd6b0abb3ee12beca26028", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..be3502f644 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..2b46e5b0cd --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "abeb38cc5212739d26e48af1239c32e6bcd38acdeffa8934cfa0eec238ba914e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..a1ff2bcb9d Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index f0dd4491d2..26e386be8b 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ed379549185fb90aacc990a30e95b69837b80e2ba48fcbe8b2c328df34d911d2", "target": {"backend": "hip", 
"arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "edd9a4dd479bd0e61aa73929bb69bdedeff06acec387784d606306865431f18c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 
21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..0445503797 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 441977fcda..7966e519cc 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "0a47bc2419c29f318626dacd627dfb5495c1a5502f6538ec6c159de4bfd43f21", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..36928ba196 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 644acdd15d..53a17c95ad 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "c468b044688faad941ae6530c535e4dc5ccab9ec70b273112a98fe310e96fbe5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "5d7c6d40c30976d49ffb61cbf961b7fe950a36c3a37bd6ba59cec67527c19145", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..70bbd03caa Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json index c18ad7066a..ed71d1dedb 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "00a3f26ad3db5e526e2ffc540824d99e1c060a1d6a6d27796dc7b6d5e2f28128", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 1, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8192, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "a843505af9d0de09f24ecbd9fb47b0ce8532558484ed9f55e38acfed4f4aa00e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 1, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8192, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_4_num_stages_1_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..22eff3d041 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..18737db4b3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "b089cd526efdd85d4e0d99613711462c0a362de9e03d13e6315d3f097b7e953c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..b2668f40db Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json index d36b270856..153a1e017f 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "50524344e989cc18ac62628fc02d73d4163eb245c05d3868c90e9efe40f885ea", "target": {"backend": "hip", 
"arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "3c10221760281745da88cf8135cfa607de0f063d4ac484b4f523c9c36e1c0ebd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 
6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..767fc69949 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json index d8d02b17f6..18f2ff2202 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "fddfd666ae3dc3d6b62572367cdeafad40d6e4e6bb921f30391f7428b1e1e338", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "72d78a34ddfbc40e0fb4f881355c529401dcdd0b759c87ef259f7bb2af00489a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..1b6ce9bac2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..887b408f0a --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "e18e6682d0234bd280b9bd99053b708d8d38a4d60ca2d73a361cb9eed704766d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..d95a6a3734 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json index 5cc7a5b2d1..eeaf944c6e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "dc7719c21d6c20b721db205eed9d3b7e5b88c2259f331a26d538e2b9da4193f2", "target": {"backend": 
"hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "443ae274e193fd6043c642cb8976463c41d70d5404bfc373ae30cc7127999df1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", 
"shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..b428691a4e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json index 0f45827489..a65774671f 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "40729b4e2ffb586195b561a7924416e04e15972dcf0e08b6be64b5979c49d7be", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "87c7858153ab8e434bec427abdaee357f7585e21511220f34366b12f60fd68ea", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..3682538491 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index cf1de878a4..b1a3df2bc0 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=16-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "1eea6a6f69efd9adbfc722daf6f70fbf96785d4c2536c27089af2750e93f1007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "6fd92a5704cafe8fc8128fcfeb9580a709cc96f90ab4bd77cb30eb36fff0719a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..53844517b4 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b13e41cf45..8d99b9dafd 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "420fc2afaf9c7fd3ed527fbe8ed836dcebd868d643eb727fd7cecda2e352dac1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..725ef6adfc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index ca1818a8b5..3abb740476 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "87c999f39f544c659c4c4b3649617c5cfab67bc2a5df8a26c6227aa4cf4ea998", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "28ee3738457603540beaceed7d072d429f4a2c5a741ddb66bd51eae63ab350f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..aab0c22969 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..35666fde2b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "b4aeabf30320553d6f69e0071a9a432edde5b09ddecb1d3c98819ec0d3457cfe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..433c6c5860 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 343d8fdce0..fdc102e3f2 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "06c044aea0bc437a4798835deb75891e2ca4f556f7d00f2f3139a895210cbb8b", "target": {"backend": "hip", 
"arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "08bad7427dda84af014b9fb0de8c2b1b41f77e3b364f03e5b35cd89c640a3e41", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 
6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..87e957c393 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 7235ff4fd4..de5bb66c57 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e20a6504a51ac5fa2fa8a64817846b4ef6e2b28d01fef20a3daa90a66ff38771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..1c9266f454 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index f2eef37457..c160ecfc89 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "f6d74b8d7b05e98a86916eaf47d9226c4134e31fab17c2db49e2f0f187e1057b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..4633c4d980 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 74fa020cf3..ca4e8466cc 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=16-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "1f80213aa7e03eafff54b07fffeecfbe5013f46fdaad2a5092c34cccd87c2115", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "c63afbc7803e924742aa4ee3bb2a2cfc4a39c135b7e349061ce18485d365a293", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..05d93400da Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 307d8cda4e..010147f21e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "b72cad96794df04006b57f794600ac44aac42f119a19c470001e89ac5b6d044c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..89264b015b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json index 94f8f302d4..9134672e63 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f0c1e844f172bda1a622216d81027dc06ff9952abbadf81e2aeaf8182b0c084f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "575dd304f117606679059d6cb68d955bb489a78beb0909257af4b0e192ad1f23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..eb127d475e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json index 83ae9f44b8..21845be148 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "8676ab450247ae5f46c633e97f5e6d8ce947a706f4dba861a79ccdcea9b82c87", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..719a1bad56 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..97c6c20cc3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "7b410e9cdd75a5f3fe773f65a28d85bcc51d8eadf022829e0d45921fa0e52914", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..951de4909c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json index 441977fcda..9b92656576 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "cc29416468762d3f1d4815aa637c2c53a46b8c7d35d98fe67a9c24e160732486", "target": {"backend": "hip", 
"arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "1db6cd1a83ff815c7665cf291d373830bd1a143e696927015a8ee2b49ebcb854", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 
6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..0944d9dd34 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json index 0c64b2b637..c8c8b85ef8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "3e3805a7ab6809b0520f39281f72e918acfd2568d5b7d1852b7aa65ff6dede2f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "2cac51523210d7e7e299993c9e25d1d44bddff40f03186798034d38fc430e9ac", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..740a857e7c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..ddf2559e4f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "eee21ae42957c88dc5a42d31b397651349617feb51b740074745b249d75c0e37", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..59205a6f65 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..5c79d583fa --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "a1b9a6eac170c026febb882b2e6e012093b1cb60feab008afa954c773a1204f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], 
"default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..5b85fd6844 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 56bed44a0c..193ff13c31 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=2-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cbaacc428925683de01130c1a14cb4a4b113da5e9e6dc35cba74a56c75a0cbcb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": 
false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..ccc1c910dc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b94e9b5868..df7471e61b 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": 
"69c533376c135f1466f40015aa8dbb2e47737901c0704ebae287d5a6c817625c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 4864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "aba3b6ee938d70c1de141002417f1c921db453f895532640c9d2d65c780ab2b2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", 
"instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 4864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..bd1a146c2b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b9d277c8a3..0cb383c0c3 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ed3a75371cec725aed630ea65b4fa508941952f5e0c9471fa93107b230a4f03b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, 
"waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "182a8ed3e19738c0c42ca43be5de931da4416ebcd0509607b61372242c2a45a2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, 
"profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..000b0d5ba4 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 0c62a8849a..e2f54b18e3 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=14336-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ee7c8bc727b05b5294121866e6263da1994dc1ee7734e4888045d513b10cb4d4", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], 
["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "b22a5a1a4d14245543d7b2ad5bd91c1641772f593a921524c3e95329aa436da0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..4cfe758be9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 41c58a031b..9f301d7816 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "98a5f489f33182cd98a113529f0648b30b5042f9318b52363b3e9c54368c2f79", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "eb96864ca43b4613934d411fd2ea15dd9cfc555c57119d06c6f4f1f75dee56a8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..28aa9941f1 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 4a266b4bdb..f7360288f7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ce7a874c46162d4a686ded6b749e772fdd69eb8099e1788200e24e28b3b714e6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "3df6d8326c4abd390dc4720816542d0605767741f96ee0230f8621937eb0fc5c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..fc9faf3808 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index ee907f79de..cab57e198e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "cd35a510fafc921b16596ea3787adfac00ce14bd5ca2f8194c08a2d8ce625c63", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "d3d6536e323275a8550d70f546c7abffe289db8eaff89278776a63fdf1a0064d", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..247c5a37a2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 70876a4dfe..d9c0c6c203 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ab1b8066b3ca873af3ef3ef52ab68a54be546b27071022826927f25df768ad7f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ab30d0921fd33b047a432093f7a8eba01043359d171b7de45de9ab54a5133867", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..3692d162f3 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index afda0cc597..4c59a2d328 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "d5ae75cc2a5e451f4541f51892bebc6bade5f9e0fc50a14924d5df9d3e862ab1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "bd51efc6fc2ebfad453142081d544dad23bb8d0bf5f803c532e453e2b8e9f6b6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..bd0113c69a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json index 80d0d783bd..4e75183db5 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "dd02aea24f07a469be50cb48315080339bec4331fb29dc3ab324044e4fad83d9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "152f10b4ee54c3b8887d02aadc4117b0fb2e5d53830b3b21aa1567f1ddd59952", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..755463aef2 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json index 0cac508631..14544f3cdc 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f2988946eb27a029d039bab8743ccf128ede0f91ff0bdcabd59a3d0a8737b90a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "6f059192e4847d649f5acf872608c5075589dc0091433f3a449c866c4d89292b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..38c63c0bce Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json index 47c0104cea..9a9dfc37d7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "3d1105e6d7346400d4523456e30259ea586690de52c26e4bd7a2c6fdbd75d2c5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "58afbabb9e2270e3dbad9690b5cc1153ce3d2f4f17fe6755ae7b6230f4a29233", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..0b2906cfb8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json index 2be5a94c79..18dcdf62c7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "7ece21689ec170d622f73f8dd019d6603006f843506aa999a4ba733398455007", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "5157457c9163fe8de2015403edc1507123d499ca3c9acacdb7e7cd108b19551c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..76f3e9b02a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json index 23034f50b9..a3355a0fac 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "0d0e2c47e7ca82c5ca8e47b5b51e21ae0139be3b70bc174af2be8545770544e0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "f057025d1f6a3f49c65ec89b7333757d4eb93f9615c6b919263fad3a110c104e", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 17408, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..1b33ea85b8 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json index e25967cdc2..892737a390 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f9bac69d0a515dec752a6cd3498979c6d2e5fa55f1f20c6c2e68de845e4c0709", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "383c9daf6dfa8ad66a127d9a5f1883a50a6752f4caa6ba98743981d30e943f52", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..d3ef79c3ce Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json index ba5641bae4..33719c4499 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=7168/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "487a67f0a0313c1afa4b0aa5dbeee4606311eaad808e0e7b69875fcb29b1edb9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "96bc5c880ab21d11a6417760154aa4a08fe8f77df9c93f608b84f72cc0c0c7dd", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10752, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..f6be9762d4 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 52d755e6a7..5e48567136 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=32-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=32-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "491ea027fe915421c2c388460048bda1909bbb2fe234f26aa650cce8f2b1f5f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "151f1abb8712ab497b6d023fb7b92094d081e09db716f99e2697b994486dbeb9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21504, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..53844517b4 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b13e41cf45..8d99b9dafd 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "18d7faac2adb5642a8e32f8baa82b17e7625c2984e8eeccac30edab6e4d3a514", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "420fc2afaf9c7fd3ed527fbe8ed836dcebd868d643eb727fd7cecda2e352dac1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..725ef6adfc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..3abb740476 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "28ee3738457603540beaceed7d072d429f4a2c5a741ddb66bd51eae63ab350f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..aab0c22969 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..35666fde2b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "b4aeabf30320553d6f69e0071a9a432edde5b09ddecb1d3c98819ec0d3457cfe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], 
"default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..433c6c5860 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..fdc102e3f2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "08bad7427dda84af014b9fb0de8c2b1b41f77e3b364f03e5b35cd89c640a3e41", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], 
"cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..87e957c393 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 2c41af514d..de5bb66c57 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "900c95ea5068e521cac115dba2f5a39c95629558de25edd9ed355a2bca806bc9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e20a6504a51ac5fa2fa8a64817846b4ef6e2b28d01fef20a3daa90a66ff38771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, 
"sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..1c9266f454 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index f2eef37457..c160ecfc89 100644 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "f6d74b8d7b05e98a86916eaf47d9226c4134e31fab17c2db49e2f0f187e1057b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", 
"fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..4633c4d980 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index ad581d4eb7..ca4e8466cc 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "bf4271e7a83f7f7e1c1b4d82c565f2c0599c4cbcf518e758b1992254783f6b47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "c63afbc7803e924742aa4ee3bb2a2cfc4a39c135b7e349061ce18485d365a293", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..05d93400da Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 307d8cda4e..010147f21e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": 
"9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "b72cad96794df04006b57f794600ac44aac42f119a19c470001e89ac5b6d044c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", 
"instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..89264b015b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..9134672e63 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "575dd304f117606679059d6cb68d955bb489a78beb0909257af4b0e192ad1f23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": 
["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..eb127d475e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json index 83ae9f44b8..21845be148 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "8676ab450247ae5f46c633e97f5e6d8ce947a706f4dba861a79ccdcea9b82c87", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..719a1bad56 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..97c6c20cc3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "7b410e9cdd75a5f3fe773f65a28d85bcc51d8eadf022829e0d45921fa0e52914", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", 
"fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..951de4909c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json index c3c9f54b2b..9b92656576 100644 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "d794139ad9c7a2aa2f2fb6efaef5771241cd987dd8be123f9af2ee45a41127fa", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "1db6cd1a83ff815c7665cf291d373830bd1a143e696927015a8ee2b49ebcb854", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", 
"fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..0944d9dd34 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json index 146ca2148f..c8c8b85ef8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "2cac51523210d7e7e299993c9e25d1d44bddff40f03186798034d38fc430e9ac", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..740a857e7c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..ddf2559e4f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "eee21ae42957c88dc5a42d31b397651349617feb51b740074745b249d75c0e37", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", 
"fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..59205a6f65 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..5c79d583fa --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "a1b9a6eac170c026febb882b2e6e012093b1cb60feab008afa954c773a1204f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..5b85fd6844 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index b81c00af1a..193ff13c31 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=4-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "5dccf75c4c6643db197699c190cf3be8883f25f5c1b82c171c1e1f9a5acf5a54", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cbaacc428925683de01130c1a14cb4a4b113da5e9e6dc35cba74a56c75a0cbcb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, 
"sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..d1f64d58f9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..954817efbc --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "d2863f812ca9919f4cf0ec9e777284e06bca17f566144d7168d0e93e696797f3", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, 
"num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..3491d72086 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 227d75eb8e..0bd14848ab 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=1280-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "304af16ef6752d5164f9f17fd233db9fa50ab36dfb098ed207eeded7ff62fe2f", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8704, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "48fa7fa7492eba12b8281e38654ced4e5f477f1b2738c39249f262235ad8fb3a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8704, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..28aa9941f1 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 5f9f3d73c6..f7360288f7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "8442b59cd54bfc72a3bed8dd9aacc04807eeff7f628d351975e80daeeb8c07b2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "3df6d8326c4abd390dc4720816542d0605767741f96ee0230f8621937eb0fc5c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..b049873ef9 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 14bfdc0ba7..6097e16fc7 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=2560-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "b01f43ede2d3f0f3d7058a795400af47f6d1e9602413dc8926d63d2c7056c74a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 36864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "a1a79beada1434a829871ab8ffa102288d547bb1b733bf8e0eeedf73ef945fc5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], 
"debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 36864, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..587a325f4a Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 
447c6a6175..87f0361e64 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "7526c8a1b2ea0dd2646354a0a3fc36c41ff4b1e46d1f24d4994f03ce10cbfd50", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "6d7f5c3af319c96f4216c47475ab3aa2e51c4347bee4fd43e22f1c7238388fe9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], 
"deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..c1282756d6 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 1a94f54cc4..a7737a4c03 100644 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "66c96ad140c5c0362b52113637538dabdb72593f77e280d7d9894f3e565863b8", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ef26c3fb40e808093f1c8fc9f2d002d13e55aece745120f4ab1a947bbe855d0b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", 
"fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_128_BLOCK_SIZE_K_256_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco similarity index 51% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco index 29d777447c..2b8e65fa7d 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% 
rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index e720580e2d..fe312b2bae 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "952567c0aad2f93638b1a9f6c7b73712b04bbcec87dcb455d62aba552ea88c23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "4e3f1a0c388291dc916ee60db0e4fdfbdafc65a5796cffe006b0dcfe249807fa", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": 
[["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..0d51bb85f0 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from 
aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 741b678759..15a82b2231 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "5cf1d8f50dfd5ca23bd5a74ef61af0b11ac8a7954942d1cf721029569da9db61", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "392b2bc2d1007f9e6b6ebcc06b48c1f33156824fccbe6a30f4e0822c098c0599", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", 
"/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..87a0a1022b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json index c737064a11..959a5fa4f9 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=1024/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f7afa37c6f7c464bac61859ac2a2eb1845baf4a665c09d03dfb371135b442ed5", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "82f633dae5e5153b28a342ae0ab70f6e2d3e761aa346187b53647fd5ba0439a7", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": 
false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..3bb148b108 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json index 
8264c424c9..80b3562c8f 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ce178e99a4c7c1317d9d74002f85df6166b0ce8be2e81a0278015f94a98f8568", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 18432, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "545c02d58ef82bf6a0207d821fda959285d1982c8b1664c5deeb1a237d05e261", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], 
"deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 18432, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..c4b6292a95 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json index 8d6754c157..8034113d72 100644 --- 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=2048/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "5f77dc9bedad300205059b17200cec0a92128347a2e1163f97475d1ac61f36d0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "700495532aeef8d16ba9f707f12b6448d71f1b844a7cf76150ac75b8cb748a27", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", 
"fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 34816, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco similarity index 51% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco index e05feb3d94..37163ed7c2 100644 Binary files a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.hsaco and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% 
rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json index d9e06080a5..1253b21f98 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=28672/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "2ca8102db9b04514810267d567d9111a674640efa166eda7ac964f2c9b62e741", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ec36099aa99e531107eb2c3c25f64c0c6971a8de889a5dc30fc4a2ae44a2b2cb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": 
[["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 19456, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..655ef5fe7f Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..761b5d9b9b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ 
+{"hash": "26c7c4a40dd58ce24cb2cc1673d5bcb989efcc5d9a15ac0edcb97126c51490ca", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..ad6f41ff11 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json index fa4dfc7269..4dd051172b 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=4096/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "762edd1695475aecc55e561d12419f63581b9f2ba898b37a2b1b23bf316fc823", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "4343c8f411a01d056a4182188aef339479bebaea49e5fa68d89406a52e21a776", 
"target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..4dbd1053f7 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..9634e61c25 --- /dev/null 
+++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "25eac10645d889445bd67388060ec83cf57a83e17335c8ba18cd81ae18f27c02", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 9728, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_32_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_NONE_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..e86d9ae9de Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index a64d2bad03..9ae9a9d63e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=64-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=64-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "f0ea00fc779e1822790bc4801a6bc5b8ca6c6859ecdeca7d368233cff1b06c66", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "ee34536f286c36e118f1a52f2281dc95788c9890590b2d4afac4c918faf053f0", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 38912, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_64_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..53844517b4 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index cf000a5aea..8d99b9dafd 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=1-N=10240-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=10240-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "4971efc8d6396be9b0df4db743227b0777c6aa214766b931a446d515ce1a8695", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "420fc2afaf9c7fd3ed527fbe8ed836dcebd868d643eb727fd7cecda2e352dac1", 
"target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..725ef6adfc Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..3abb740476 --- /dev/null +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=1280-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "28ee3738457603540beaceed7d072d429f4a2c5a741ddb66bd51eae63ab350f2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_8"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..aab0c22969 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..35666fde2b --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=14336-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "b4aeabf30320553d6f69e0071a9a432edde5b09ddecb1d3c98819ec0d3457cfe", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new 
file mode 100644 index 0000000000..433c6c5860 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..fdc102e3f2 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=2560-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "08bad7427dda84af014b9fb0de8c2b1b41f77e3b364f03e5b35cd89c640a3e41", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..87e957c393 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 7235ff4fd4..de5bb66c57 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=28672-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=28672-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "fe3758b9506495129900c7cf93886044f55e7ae4a301af969674ca852f415a5a", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", 
"fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "e20a6504a51ac5fa2fa8a64817846b4ef6e2b28d01fef20a3daa90a66ff38771", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..1c9266f454 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index f2eef37457..c160ecfc89 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=5120-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=5120-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "ada59cd55b8b6fe94986c411060bcddc6f9248b327c3caeb5726a051269f1ce6", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, 
"launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "f6d74b8d7b05e98a86916eaf47d9226c4134e31fab17c2db49e2f0f187e1057b", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 12800, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 
0000000000..4633c4d980 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index aa851bad5e..ca4e8466cc 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=2-N=57344-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=57344-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "39055514308e3b06ce23fd535721c52ecd3fde994340f13ca04cf458fb9ad977", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, 
"backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "c63afbc7803e924742aa4ee3bb2a2cfc4a39c135b7e349061ce18485d365a293", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 5376, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..05d93400da Binary files /dev/null and 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 307d8cda4e..010147f21e 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=7168-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=7168-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "9dbd38f815d7c4b94125e8752f305f34ac64ee8016e9da0a4a96de97b39cbbf9", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", 
"schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "b72cad96794df04006b57f794600ac44aac42f119a19c470001e89ac5b6d044c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_4_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..89264b015b Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..9134672e63 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=1024/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "575dd304f117606679059d6cb68d955bb489a78beb0909257af4b0e192ad1f23", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 6, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 3200, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_512_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_6_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file 
mode 100644 index 0000000000..eb127d475e Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json index 83ae9f44b8..21845be148 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=14336/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=14336/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "1efbe003cc33d4234b58aeb88e93b09225ed8b61a992e952703136099c838dd2", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "8676ab450247ae5f46c633e97f5e6d8ce947a706f4dba861a79ccdcea9b82c87", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 1, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 26112, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_1_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_2"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..719a1bad56 Binary files /dev/null and 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..97c6c20cc3 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=2048/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "7b410e9cdd75a5f3fe773f65a28d85bcc51d8eadf022829e0d45921fa0e52914", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6400, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_64_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git 
a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..951de4909c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..9b92656576 --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=28672/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "1db6cd1a83ff815c7665cf291d373830bd1a143e696927015a8ee2b49ebcb854", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 6528, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_16_BLOCK_SIZE_N_32_BLOCK_SIZE_K_256_GROUP_SIZE_M_4_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_7"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..0944d9dd34 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json index 146ca2148f..c8c8b85ef8 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=4-N=8192-K=3584/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=3584/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "70f2db6830f849f710567cea9c20e0c7bba4770c4b207b43fac139574b52cc47", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", 
"/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "2cac51523210d7e7e299993c9e25d1d44bddff40f03186798034d38fc430e9ac", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 10880, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": 
"_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_512_GROUP_SIZE_M_8_num_warps_2_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..740a857e7c Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..ddf2559e4f --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=4096/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "eee21ae42957c88dc5a42d31b397651349617feb51b740074745b249d75c0e37", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, 
"max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..59205a6f65 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json new file mode 100644 index 0000000000..5c79d583fa --- /dev/null +++ b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=7168/_gemm_afp4wfp4_preshuffle_kernel.json @@ -0,0 +1 @@ +{"hash": "a1b9a6eac170c026febb882b2e6e012093b1cb60feab008afa954c773a1204f1", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 2, "waves_per_eu": 4, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], 
"default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 21760, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_32_BLOCK_SIZE_K_1024_GROUP_SIZE_M_1_num_warps_2_num_stages_2_waves_per_eu_4_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_1"} \ No newline at end of file diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco new file mode 100644 index 0000000000..5b85fd6844 Binary files /dev/null and b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.hsaco differ diff --git a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json similarity index 72% rename from aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json rename to aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json index 56bed44a0c..193ff13c31 100644 --- a/aiter/ops/triton/configs/gemm/aot/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales_M=8-N=8192-K=8192/_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.json +++ 
b/aiter/ops/triton/configs/gemm/aot/_gemm_afp4wfp4_preshuffle_kernel_M=8-N=8192-K=8192/_gemm_afp4wfp4_preshuffle_kernel.json @@ -1 +1 @@ -{"hash": "e3d11be1f7e25cbb51cfcc75be2bc37f0d2592e6cd6aa1e60d3e209fc72cd38c", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4_wfp4_kernel_preshuffled_weight_scales"} \ No newline at end of file +{"hash": "cbaacc428925683de01130c1a14cb4a4b113da5e9e6dc35cba74a56c75a0cbcb", "target": {"backend": "hip", "arch": "gfx950", "warp_size": 64}, "num_warps": 4, "waves_per_eu": 2, "num_stages": 2, "num_ctas": 1, "extern_libs": [["ocml", "/app/triton-tot/python/triton/backends/amd/lib/ocml.bc"], ["ockl", "/app/triton-tot/python/triton/backends/amd/lib/ockl.bc"]], "cluster_dims": [1, 1, 1], "debug": false, "sanitize_overflow": true, "arch": "gfx950", "supported_fp8_dtypes": ["fp8e4b8", "fp8e4nv", "fp8e5", "fp8e5b16"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b8", "fp8e5b16"], "default_dot_input_precision": "ieee", "allowed_dot_input_precisions": ["ieee", "bf16x3", "bf16x6"], "enable_fp_fusion": true, "launch_cooperative_grid": 
false, "matrix_instr_nonkdim": 16, "kpack": 1, "allow_flush_denorm": false, "max_num_imprecise_acc_default": 0, "backend_name": "hip", "instrumentation_mode": "", "schedule_hint": "none", "warp_size": 64, "triton_version": "3.5.0", "shared": 8448, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "_gemm_afp4wfp4_preshuffle_kernel_BLOCK_SIZE_M_8_BLOCK_SIZE_N_128_BLOCK_SIZE_K_1024_GROUP_SIZE_M_8_num_warps_4_num_stages_2_waves_per_eu_2_matrix_instr_nonkdim_16_cache_modifier_CG_NUM_KSPLIT_4"} \ No newline at end of file diff --git a/aiter/ops/triton/fused_fp8_quant.py b/aiter/ops/triton/fused_fp8_quant.py index 42ca530f97..c86d6d642e 100644 --- a/aiter/ops/triton/fused_fp8_quant.py +++ b/aiter/ops/triton/fused_fp8_quant.py @@ -394,7 +394,7 @@ def fused_reduce_act_mul_fp8_group_quant( x2: (SPK, M, 2*N1), dtype = fp32. if x is 2-dim, - x: (M, N2), dtype = fp16 or bf16. + x: (M, 2*N1), dtype = fp16 or bf16. x2 must be None the kernel is essentially identical to aiter.ops.triton.activation.act_mul_and_fp8_group_quant @@ -412,7 +412,9 @@ def fused_reduce_act_mul_fp8_group_quant( y_scale: (M, cdiv(N1, group_size)), dtype = fp32 y2: (M, N2), dtype = dtype """ - _LOGGER.info(f"FUSED_REDUCTION_ACT_MUL_FP8_GROUP_QUANT: x={tuple(x.shape)}") + _LOGGER.info( + f"FUSED_REDUCTION_ACT_MUL_FP8_GROUP_QUANT: x={tuple(x.shape)} activation={activation}" + ) assert ( x.dim() == 2 or x.dim() == 3 diff --git a/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py b/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py index 40f5f62633..90b50bd066 100644 --- a/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py +++ b/aiter/ops/triton/fused_gemm_a8w8_blockscale_a16w16.py @@ -47,12 +47,13 @@ def fused_gemm_a8w8_blockscale_a16w16( - x_fp8_scale: Scale tensor for X with shape (M, *scale_k). - w_fp8_scale: Scale tensor for W with shape (**scale_n, *scale_k). - x_bf16: Matrix X with shape (M, K). - - w_bf16: Matrix W with shape (N_fp8, K). + - w_bf16: Matrix W with shape (N_bf16, K). 
Note: M, N, K must be identical for x_fp8 and x_bf16, but the N-dim fow w_fp8 and w_bf16 can be different Returns: - - Y: The output matrix with shape (M, N). + - y_fp8: The output matrix with shape (M, N_fp8). + - y_bf16: The output matrix with shape (M, N_bf16). *scale_k = (K + scale_block_size_k - 1) // scale_block_size_k **scale_n = (N_fp8 + scale_block_size_n - 1) // scale_block_size_n @@ -74,7 +75,7 @@ def fused_gemm_a8w8_blockscale_a16w16( x_fp8.shape[1] == x_bf16.shape[1] ), "K-dim should be identical for x_fp8 and x_bf16" assert x_fp8.shape[1] == w_fp8.shape[1], "Incompatible dimensions!!!" - assert w_bf16.shape[1] == w_bf16.shape[1], "Incompatible dimensions!!!" + assert x_bf16.shape[1] == w_bf16.shape[1], "Incompatible dimensions!!!" # Transpose w and w_scale w_fp8 = w_fp8.T diff --git a/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py b/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py new file mode 100644 index 0000000000..5dfeb5737e --- /dev/null +++ b/aiter/ops/triton/fused_gemm_afp4wfp4_a16w16.py @@ -0,0 +1,258 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +from typing import Optional +import functools +import json +import os +import torch +import triton +import triton.language as tl +from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton._triton_kernels.fused_gemm_afp4wfp4_a16w16 import ( + _fused_gemm_afp4wfp4_a16w16_kernel, + _fused_gemm_afp4wfp4_preshuffle_a16w16_kernel, + _fused_gemm_afp4wfp4_a16w16_reduce_kernel, + _get_config, +) +from aiter.ops.triton.gemm_afp4wfp4 import get_splitk +from .utils.core import AITER_TRITON_CONFIGS_PATH + +import os +from aiter.utility.triton.triton_metadata_redirect import AOTMetadataContext + +_LOGGER = AiterTritonLogger() + + +def fused_gemm_afp4wfp4_a16w16( + x_fp4: torch.Tensor, + w_fp4: torch.Tensor, + x_fp4_scale: torch.Tensor, + w_fp4_scale: torch.Tensor, + x_bf16: torch.Tensor, + w_bf16: torch.Tensor, + is_fp4_preshuffled: bool = True, + bias_fp4: Optional[torch.Tensor] = None, + bias_bf16: Optional[torch.Tensor] = None, + dtype: Optional[float] = torch.bfloat16, + y_fp4: Optional[torch.Tensor] = None, + y_bf16: Optional[torch.Tensor] = None, + skip_reduce: Optional[bool] = False, + config: Optional[dict] = None, + use_aot: Optional[bool] = True, +): + """ + Computes the 8 bit matmul Y = X x WT + B using the block-scale quantization approach for x_fp4 and w_fp4. + Computes the 16 bit matmul Y = X x WT + B for x_bf16 and w_bf16 + + This fusion is primarily aiming for fusing the gate up-projections and MOE gating: + gate up-projections: (M, K) x (2N, K) = (M, 2N) + MOE gating: (M, K) x (N, K) + (N, ) = (M, N) + + Key parameters: + - x_fp4: Matrix X with shape (M, K). + - w_fp4: Matrix W with shape (N_fp4, K). + - x_fp4_scale: Scale tensor for X with shape (M, K // 32) + - w_fp4_scale: Scale tensor for W with shape (N, K // 32) + - x_bf16: Matrix X with shape (M, K). + - w_bf16: Matrix W with shape (N_bf16, K). 
+ + Note: M, N, K must be identical for x_fp4 and x_bf16, but the N-dim fow w_fp4 and w_bf16 can be different + + Returns: + - y_fp4: The output matrix with shape (M, N_fp4). + - y_bf16: The output matrix with shape (M, N_bf16). + + """ + _LOGGER.info( + f"FUSED_GEMM_A8W8_BLOCKSCALE_A16W16: x_fp4={tuple(x_fp4.shape)} w_fp4={tuple(w_fp4.shape)} x_fp4_scale={tuple(x_fp4_scale.shape)} w_fp4_scale={tuple(w_fp4_scale.shape)} x_bf16={tuple(x_bf16.shape)} w_bf16={tuple(w_bf16.shape)}" + ) + + M, K = x_fp4.shape + N_fp4, K = w_fp4.shape + if is_fp4_preshuffled: + N_fp4 = N_fp4 * 16 + K = K // 16 + M, _ = x_bf16.shape + N_bf16, _ = w_bf16.shape + + # Check constraints. + assert ( + x_fp4.shape[0] == x_bf16.shape[0] + ), "M-dim should be identical for x_fp4 and x_bf16" + assert ( + x_fp4.shape[1] * 2 == x_bf16.shape[1] + ), "K-dim should be identical for x_fp4 and x_bf16" + if is_fp4_preshuffled: + assert x_fp4.shape[1] == w_fp4.shape[1] // 16, "Incompatible dimensions!!!" + else: + assert x_fp4.shape[1] == w_fp4.shape[1], "Incompatible dimensions!!!" + assert x_bf16.shape[1] == w_bf16.shape[1], "Incompatible dimensions!!!" 
+ + # Transpose w and w_scale + if not is_fp4_preshuffled: + w_fp4 = w_fp4.T + w_bf16 = w_bf16.T + + if config is None: + config = _get_config(M, N_fp4, N_bf16, K, is_fp4_preshuffled) + + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + config["NUM_KSPLIT"] = triton.cdiv(K, config["SPLITK_BLOCK_SIZE"] // 2) + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + config["NUM_KSPLIT"] = 1 + + if y_fp4 is None and (config["NUM_KSPLIT"] == 1 or not skip_reduce): + y_fp4 = torch.empty((M, N_fp4), dtype=dtype, device=x_fp4.device) + + if y_bf16 is None and (config["NUM_KSPLIT"] == 1 or not skip_reduce): + y_bf16 = torch.empty((M, N_bf16), dtype=dtype, device=x_bf16.device) + + if config["NUM_KSPLIT"] > 1: + y_fp4_pp = torch.empty( + (config["NUM_KSPLIT"], M, N_fp4), + dtype=torch.float32, + device=x_fp4.device, + ) + y_bf16_pp = torch.empty( + (config["NUM_KSPLIT"], M, N_bf16), + dtype=torch.float32, + device=x_bf16.device, + ) + else: + y_fp4_pp = None + y_bf16_pp = None + + config["BLOCK_SIZE_N"] = max(config["BLOCK_SIZE_N"], 32) + if is_fp4_preshuffled: + if M < 32: + assert ( + config["BLOCK_SIZE_M"] <= 16 + ), "for M < 32, BLOCK_SIZE_M must be 16 or less as x_scale are assumed to be un-shuffled" + else: + assert ( + config["BLOCK_SIZE_M"] >= 32 + ), "for M >= 32, BLOCK_SIZE_M must be 32 or more as x_scale are assumed to be preshuffled" + + grid = lambda META: ( # noqa: E731 + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * ( + triton.cdiv(N_fp4, META["BLOCK_SIZE_N"]) + + triton.cdiv(N_bf16, META["BLOCK_SIZE_N"]) + ) + ), + ) + selected_kernel = ( + _fused_gemm_afp4wfp4_preshuffle_a16w16_kernel + if 
is_fp4_preshuffled + else _fused_gemm_afp4wfp4_a16w16_kernel + ) + + def selected_kernel_wrapper(): + selected_kernel[grid]( + x_fp4, + w_fp4, + bias_fp4, + x_fp4_scale, + w_fp4_scale, + y_fp4 if config["NUM_KSPLIT"] == 1 else y_fp4_pp, + x_bf16, + w_bf16, + bias_bf16, + y_bf16 if config["NUM_KSPLIT"] == 1 else y_bf16_pp, + M, + N_fp4, + N_bf16, + K, + x_fp4.stride(0), + x_fp4.stride(1), + w_fp4.stride(0), + w_fp4.stride(1), + x_fp4_scale.stride(0), + x_fp4_scale.stride(1), + w_fp4_scale.stride(0), + w_fp4_scale.stride(1), + 0 if config["NUM_KSPLIT"] == 1 else y_fp4_pp.stride(0), + y_fp4.stride(0) if config["NUM_KSPLIT"] == 1 else y_fp4_pp.stride(1), + y_fp4.stride(1) if config["NUM_KSPLIT"] == 1 else y_fp4_pp.stride(2), + x_bf16.stride(0), + x_bf16.stride(1), + w_bf16.stride(0), + w_bf16.stride(1), + 0 if config["NUM_KSPLIT"] == 1 else y_bf16_pp.stride(0), + y_bf16.stride(0) if config["NUM_KSPLIT"] == 1 else y_bf16_pp.stride(1), + y_bf16.stride(1) if config["NUM_KSPLIT"] == 1 else y_bf16_pp.stride(2), + ADD_BIAS_FP4=(bias_fp4 is not None), + ADD_BIAS_BF16=(bias_bf16 is not None), + SKIP_REDUCE=skip_reduce, + **config, + ) + + M_POW2 = triton.next_power_of_2(M) + if M < 32 and M_POW2 > 16: + M_POW2 = 16 + metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{selected_kernel.fn.__name__}_M={M_POW2}-N4={N_fp4}-N16={N_bf16}-K={K*2}" + if use_aot and os.path.exists(metadata_pth): + with AOTMetadataContext( + selected_kernel.fn.__name__, + f"{metadata_pth}", + ): + selected_kernel_wrapper() + else: + selected_kernel_wrapper() + + if config["NUM_KSPLIT"] > 1: + if skip_reduce: + return y_fp4_pp, y_bf16_pp + REDUCE_BLOCK_SIZE_M = 32 + REDUCE_BLOCK_SIZE_N = 32 + ACTUAL_KSPLIT = triton.cdiv(K, config["SPLITK_BLOCK_SIZE"] // 2) + + grid_reduce = ( + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N_fp4, REDUCE_BLOCK_SIZE_N) + + triton.cdiv(N_bf16, REDUCE_BLOCK_SIZE_N), + ) + _fused_gemm_afp4wfp4_a16w16_reduce_kernel[grid_reduce]( + bias_fp4, + y_fp4_pp, + y_fp4, + 
bias_bf16, + y_bf16_pp, + y_bf16, + M, + N_fp4, + N_bf16, + y_fp4_pp.stride(0), + y_fp4_pp.stride(1), + y_fp4_pp.stride(2), + y_fp4.stride(0), + y_fp4.stride(1), + y_bf16_pp.stride(0), + y_bf16_pp.stride(1), + y_bf16_pp.stride(2), + y_bf16.stride(0), + y_bf16.stride(1), + REDUCE_BLOCK_SIZE_M, + REDUCE_BLOCK_SIZE_N, + ACTUAL_KSPLIT, + triton.next_power_of_2(config["NUM_KSPLIT"]), + ADD_BIAS_FP4=(bias_fp4 is not None), + ADD_BIAS_BF16=(bias_bf16 is not None), + ) + + return y_fp4, y_bf16 diff --git a/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py b/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py new file mode 100644 index 0000000000..3d5d3d8640 --- /dev/null +++ b/aiter/ops/triton/fused_gemm_afp4wfp4_mul_add.py @@ -0,0 +1,445 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +from typing import Optional, Union +import torch +import triton +import triton.language as tl +import aiter.ops.triton.utils._triton.arch_info as arch_info +from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton._triton_kernels.fused_gemm_afp4wfp4_mul_add import ( + _fused_gemm_afp4wfp4_mul_add_kernel, + _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel, + _fused_gemm_afp4wfp4_mul_add_reduce_kernel, + _get_config, +) +from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( + _gemm_afp4wfp4_reduce_kernel, +) +from .utils.core import AITER_TRITON_CONFIGS_PATH + +import os +from aiter.utility.triton.triton_metadata_redirect import AOTMetadataContext + +_LOGGER = AiterTritonLogger() + +global _USE_GEMM_SPLITK_BF16 +_USE_GEMM_SPLITK_BF16 = False + + +def set_use_gemm_splitk_bf16(value: bool): + global _USE_GEMM_SPLITK_BF16 + _USE_GEMM_SPLITK_BF16 = value + + +def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int): + # heuristics for make "EVEN_K == True" as much as possible + NUM_KSPLIT_STEP = 2 + BLOCK_SIZE_K_STEP = 2 + SPLITK_BLOCK_SIZE = ( + triton.cdiv((2 * triton.cdiv(K, NUM_KSPLIT)), 
BLOCK_SIZE_K) * BLOCK_SIZE_K + ) + while NUM_KSPLIT > 1 and BLOCK_SIZE_K > 16: + if ( + K % (SPLITK_BLOCK_SIZE // 2) == 0 + and SPLITK_BLOCK_SIZE % BLOCK_SIZE_K == 0 + and K % (BLOCK_SIZE_K // 2) == 0 + ): + break + elif K % (SPLITK_BLOCK_SIZE // 2) != 0 and NUM_KSPLIT > 1: + NUM_KSPLIT = NUM_KSPLIT // NUM_KSPLIT_STEP + elif SPLITK_BLOCK_SIZE % BLOCK_SIZE_K != 0: + if NUM_KSPLIT > 1: + NUM_KSPLIT = NUM_KSPLIT // NUM_KSPLIT_STEP + elif BLOCK_SIZE_K > 16: + BLOCK_SIZE_K = BLOCK_SIZE_K // BLOCK_SIZE_K_STEP + elif K % (BLOCK_SIZE_K // 2) != 0 and BLOCK_SIZE_K > 16: + BLOCK_SIZE_K = BLOCK_SIZE_K // BLOCK_SIZE_K_STEP + else: + break + + SPLITK_BLOCK_SIZE = ( + triton.cdiv((2 * triton.cdiv(K, NUM_KSPLIT)), BLOCK_SIZE_K) * BLOCK_SIZE_K + ) + + return SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT + + +def fused_gemm_afp4wfp4_mul_add( + x, + w, + x_scales, + w_scales, + a: Union[torch.Tensor, float, int], + b: Union[torch.Tensor, float, int], + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, + fuse_type: Optional[int] = 0, +): + """ + Computes matrix multiplication Y = X @ W^T with FP4 activations and FP4 weights. + if fuse_type == 0: + the final output = a * Y + b + elif fuse_type == 1 + the final output = a * b + Y + + Args: + x (torch.Tensor): FP4 E2M1 input matrix with shape (M, K). + w (torch.Tensor): FP4 E2M1 weight matrix with shape (N, K), internally transposed. + x_scales (torch.Tensor): E8M0 per-group scale for x with shape (M, K//32). + One scale per 32 elements in K dimension. + w_scales (torch.Tensor): E8M0 per-group scale for w with shape (N, K//32). + One scale per 32 elements in K dimension. + dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). + y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). + config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). 
+ + Returns: + torch.Tensor: Output with shape (M, N). + """ + + _LOGGER.info( + f"GEMM_AFPWFP4: x.shape={tuple(x.shape)} w.shape={tuple(w.shape)} x_scale={tuple(x_scales.shape)} w_scale={tuple(w_scales.shape)} " + ) + + if isinstance(a, float) or isinstance(a, int): + IS_A_SCALAR = True + IS_A_TENSOR = False + elif isinstance(a, torch.Tensor) and a.is_contiguous(): + IS_A_TENSOR = True + if a.numel() == 1: + IS_A_SCALAR = True + else: + IS_A_SCALAR = False + if isinstance(b, float) or isinstance(b, int): + IS_B_SCALAR = True + IS_B_TENSOR = False + elif isinstance(b, torch.Tensor) and b.is_contiguous(): + IS_B_TENSOR = True + if b.numel() == 1: + IS_B_SCALAR = True + else: + IS_B_SCALAR = False + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + M, K = x.shape + N, K = w.shape + + # Transpose w + w = w.T + + if y is None: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K) + + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + + if _USE_GEMM_SPLITK_BF16: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device + ) + else: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device + ) + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + y_pp = None + + grid = lambda META: ( # noqa: E731 + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + _fused_gemm_afp4wfp4_mul_add_kernel[grid]( + x, + w, + y if config["NUM_KSPLIT"] == 1 else y_pp, + x_scales, + w_scales, + a, + b, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), + y.stride(0) if config["NUM_KSPLIT"] == 1 else 
y_pp.stride(1), + y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), + x_scales.stride(0), + x_scales.stride(1), + w_scales.stride(0), + w_scales.stride(1), + 0 if IS_A_SCALAR else a.stride(0), + 0 if IS_A_SCALAR else a.stride(1), + 0 if IS_B_SCALAR else b.stride(0), + 0 if IS_B_SCALAR else b.stride(1), + IS_A_SCALAR=IS_A_SCALAR, + IS_B_SCALAR=IS_B_SCALAR, + IS_A_TENSOR=IS_A_TENSOR, + IS_B_TENSOR=IS_B_TENSOR, + FUSE_TYPE=fuse_type, + **config, + ) + + if config["NUM_KSPLIT"] > 1: + REDUCE_BLOCK_SIZE_M = 16 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 + ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) + + grid_reduce = ( + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N, REDUCE_BLOCK_SIZE_N), + ) + _fused_gemm_afp4wfp4_mul_add_reduce_kernel[grid_reduce]( + y_pp, + y, + a, + b, + M, + N, + y_pp.stride(0), + y_pp.stride(1), + y_pp.stride(2), + y.stride(0), + y.stride(1), + 0 if IS_A_SCALAR else a.stride(0), + 0 if IS_A_SCALAR else a.stride(1), + 0 if IS_B_SCALAR else b.stride(0), + 0 if IS_B_SCALAR else b.stride(1), + REDUCE_BLOCK_SIZE_M, + REDUCE_BLOCK_SIZE_N, + ACTUAL_KSPLIT, + triton.next_power_of_2(config["NUM_KSPLIT"]), + IS_A_SCALAR=IS_A_SCALAR, + IS_B_SCALAR=IS_B_SCALAR, + IS_A_TENSOR=IS_A_TENSOR, + IS_B_TENSOR=IS_B_TENSOR, + FUSE_TYPE=fuse_type, + ) + + return y + + +def fused_gemm_afp4wfp4_preshuffle_add_mul( + x, + w, + x_scales, + w_scales, + a: Union[torch.Tensor, float, int], + b: Union[torch.Tensor, float, int], + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, + use_aot: Optional[bool] = True, + fuse_type: Optional[int] = 0, +): + """ + Computes matrix multiplication Y = X @ W^T with FP4 activations and FP4 weights using preshuffled 
weight scales. + Weight matrix and scales are stored in optimized layout for improved performance. + if fuse_type == 0: + The final output = a * Y + b + elif fuse_type == 1 + The final output = a * b + Y + + Args: + x (torch.Tensor): FP4 E2M1 input matrix with shape (M, K). + w (torch.Tensor): FP4 E2M1 weight matrix with shape (N//16, K*16), internally transposed. + Preshuffled layout: logical shape after unpacking is (N, K). + x_scales (torch.Tensor): E8M0 per-group scale for x with shape (M//32, K) if M >= 32, + or (M, K//32) if M < 32. + w_scales (torch.Tensor): E8M0 per-group scale for w with shape (N//32, K). + Groups of 32 rows in N dimension share K scales. + dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). + y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). + config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, SPLITK_BLOCK_SIZE). + use_aot (Optional[bool]): Enable ahead-of-time compilation metadata. + + Returns: + torch.Tensor: Output with shape (M, N). 
+ """ + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + if isinstance(a, float) or isinstance(a, int): + IS_A_SCALAR = True + IS_A_TENSOR = False + elif isinstance(a, torch.Tensor) and a.is_contiguous(): + IS_A_TENSOR = True + if a.numel() == 1: + IS_A_SCALAR = True + else: + IS_A_SCALAR = False + if isinstance(b, float) or isinstance(b, int): + IS_B_SCALAR = True + IS_B_TENSOR = False + elif isinstance(b, torch.Tensor) and b.is_contiguous(): + IS_B_TENSOR = True + if b.numel() == 1: + IS_B_SCALAR = True + else: + IS_B_SCALAR = False + + M, K = x.shape + N, K = w.shape + N = N * 16 + K = K // 16 + + if y is None: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config is None: + config = _get_config(M, N, K, True) + + if config["NUM_KSPLIT"] > 1: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + + if _USE_GEMM_SPLITK_BF16: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=y.dtype, device=y.device + ) + else: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device + ) + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + y_pp = None + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + + config["BLOCK_SIZE_N"] = max(config["BLOCK_SIZE_N"], 32) + if M < 32: + assert ( + config["BLOCK_SIZE_M"] <= 16 + ), "for M < 32, BLOCK_SIZE_M must be 16 or less as x_scale are assumed to be un-shuffled" + else: + assert ( + config["BLOCK_SIZE_M"] >= 32 + ), "for M >= 32, BLOCK_SIZE_M must be 32 or more as x_scale are assumed to be preshuffled" + + grid = lambda META: ( # noqa: E731 + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + + def kernel_wrapper(): + 
_fused_gemm_afp4wfp4_preshuffle_mul_add_kernel[grid]( + x, + w, + y if config["NUM_KSPLIT"] == 1 else y_pp, + x_scales, + w_scales, + a, + b, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + 0 if config["NUM_KSPLIT"] == 1 else y_pp.stride(0), + y.stride(0) if config["NUM_KSPLIT"] == 1 else y_pp.stride(1), + y.stride(1) if config["NUM_KSPLIT"] == 1 else y_pp.stride(2), + x_scales.stride(0), + x_scales.stride(1), + w_scales.stride(0), + w_scales.stride(1), + 0 if IS_A_SCALAR else a.stride(0), + 0 if IS_A_SCALAR else a.stride(1), + 0 if IS_B_SCALAR else b.stride(0), + 0 if IS_B_SCALAR else b.stride(1), + IS_A_SCALAR=IS_A_SCALAR, + IS_B_SCALAR=IS_B_SCALAR, + IS_A_TENSOR=IS_A_TENSOR, + IS_B_TENSOR=IS_B_TENSOR, + FUSE_TYPE=fuse_type, + **config, + ) + + M_POW2 = triton.next_power_of_2(M) + if M < 32 and M_POW2 > 16: + M_POW2 = 16 + metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_fused_gemm_afp4wfp4_preshuffle_mul_add_kernel.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" + if use_aot and os.path.exists(metadata_pth): + with AOTMetadataContext( + _fused_gemm_afp4wfp4_preshuffle_mul_add_kernel.fn.__name__, + f"{metadata_pth}", + ): + kernel_wrapper() + else: + kernel_wrapper() + + if config["NUM_KSPLIT"] > 1: + REDUCE_BLOCK_SIZE_M = 16 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + REDUCE_BLOCK_SIZE_N = 128 if _USE_GEMM_SPLITK_BF16 else 64 + ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) + + grid_reduce = ( + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N, REDUCE_BLOCK_SIZE_N), + ) + _fused_gemm_afp4wfp4_mul_add_reduce_kernel[grid_reduce]( + y_pp, + y, + a, + b, + M, + N, + y_pp.stride(0), + y_pp.stride(1), + y_pp.stride(2), + y.stride(0), + y.stride(1), + 0 if IS_A_SCALAR else a.stride(0), + 0 if IS_A_SCALAR else a.stride(1), + 0 if IS_B_SCALAR else 
b.stride(0), + 0 if IS_B_SCALAR else b.stride(1), + REDUCE_BLOCK_SIZE_M, + REDUCE_BLOCK_SIZE_N, + ACTUAL_KSPLIT, + triton.next_power_of_2(config["NUM_KSPLIT"]), + IS_A_SCALAR=IS_A_SCALAR, + IS_B_SCALAR=IS_B_SCALAR, + IS_A_TENSOR=IS_A_TENSOR, + IS_B_TENSOR=IS_B_TENSOR, + FUSE_TYPE=fuse_type, + ) + + return y diff --git a/aiter/ops/triton/fused_mxfp4_quant.py b/aiter/ops/triton/fused_mxfp4_quant.py index 141bf6d2fe..8e19148e53 100644 --- a/aiter/ops/triton/fused_mxfp4_quant.py +++ b/aiter/ops/triton/fused_mxfp4_quant.py @@ -1,3 +1,4 @@ +from typing import Literal import torch import triton import triton.language as tl @@ -7,6 +8,10 @@ _rmsmorm_op, _fused_rms_mxfp4_quant_kernel, _fused_flatten_mxfp4_quant, + _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel, +) +from aiter.ops.triton._triton_kernels.activation import ( + _get_activation_from_str, ) from aiter.ops.triton.utils.logger import AiterTritonLogger @@ -23,6 +28,7 @@ def fused_rms_mxfp4_quant( res1: Optional[torch.Tensor] = None, shuffle: Optional[bool] = False, scale_shuffle_padding: Optional[bool] = False, + output_unquantized_inp1=False, ): """ This op contains several steps: @@ -78,6 +84,12 @@ def fused_rms_mxfp4_quant( device=x1.device, ) + out1 = None + out1_stride_m = 0 + if output_unquantized_inp1: + out1 = torch.empty((M, N1), dtype=x1.dtype, device=x1.device) + out1_stride_m = out1.stride(0) + out_res1 = None res1_stride_m = 0 out_res1_stride_m = 0 @@ -105,6 +117,7 @@ def fused_rms_mxfp4_quant( out1_bs, out2, out_res1, + out1, x1_epsilon, x2_epsilon, M, @@ -117,12 +130,14 @@ def fused_rms_mxfp4_quant( *out1_bs.stride(), out2_stride_m, out_res1_stride_m, + out1_stride_m, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_N2=BLOCK_SIZE_N2, MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE, HAS_SECOND_INPUT=(x2 is not None), FIRST_INPUT_RES=(res1 is not None), + FIRST_INPUT_OUT=output_unquantized_inp1, SCALE_N=SCALE_N_valid, SCALE_M_PAD=(SCALE_M if use_scale_shuffle_padding else 1), 
SCALE_N_PAD=SCALE_N, @@ -130,7 +145,7 @@ def fused_rms_mxfp4_quant( SHUFFLE_PAD=use_scale_shuffle_padding, ) - return (out1_fp4, out1_bs), out2, out_res1 + return (out1_fp4, out1_bs), out1, out2, out_res1 def fused_flatten_mxfp4_quant( @@ -176,3 +191,179 @@ def fused_flatten_mxfp4_quant( ) return out, out_block_scales + + +def fused_reduce_act_mul_and_mxfp4_quant( + x: torch.Tensor, + activation: Literal["silu", "gelu", "gelu_tanh"], + x2: Optional[torch.Tensor] = None, + scaling_mode: str = "even", + shuffle: bool = False, + scale_shuffle_padding: bool = False, + dtype: Optional[float] = torch.bfloat16, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply reduction along the first dimension and apply the activation function + per-token group quantization to MX FP4 format. + If x2 is provided, the only reduction along the first dimension is applied to x2 + + Args: + if x is 3-dim, + x: (SPK, M, 2*N1), dtype = fp32. + x2: (SPK, M, 2*N1), dtype = fp32. + + if x is 2-dim, + x: (M, 2*N1), dtype = fp16 or bf16. + x2 must be None + the kernel is essentially identical to aiter.ops.triton.activation.act_mul_and_mxfp4_group_quant + + activation: activation function to apply before quantization. + - It splits the features into two parts and applies the activation to the first part. + - Then, it adds the results together before quantization. + - Supports the following activations: + - "silu" + - "gelu" + - "gelu_tanh" + + scaling_mode: The method to calculate MX block scaling. + - "even" (default): `even_round` in `quark.torch.quantization.utils`. + - etc. + shuffle: Indicates whether to enable preshuffling of scales. + - When enabled, scale dimensions (X, Y) are adjusted to be multiples of 8 and 256, respectively. 
+ Returns: + tuple: (y, y_scale), y2 + if shuffle or scale_shuffle_padding: + y: (M_pad, N1_pad), dtype = uint8 + y_scale: (M_pad, N1_pad), dtype = uint8 + y2: (M, N2), dtype = dtype + + where M_pad = cdiv(M, 256) * 256 + N1_pad = cdiv(cdiv(N1, MXFP4_QUANT_BLOCK_SIZE), 8) * 8 + else: + y: (M, N1), dtype = uint8 + y_scale: (M, cdiv(N1, MXFP4_QUANT_BLOCK_SIZE)), dtype = uint8 + y2: (M, N2), dtype = dtype + + A tuple of (y, y_scale). + """ + _LOGGER.info( + f"ACT_MUL_MXFP4_QUANT: x={tuple(x.shape)} activation={activation} shuffle={shuffle}" + ) + + assert ( + x.dim() == 2 or x.dim() == 3 + ), "The number of dimentions for x should be 2 or 3" + X_HAS_SPLITK = False + x_num_splitk = 1 + N2 = 1 + y2 = None + if x.dim() == 3: + x_num_splitk, M, N1 = x.shape + x_num_splitk, _, N2 = x2.shape + assert ( + x.shape[0] == x2.shape[0] and x.shape[1] == x2.shape[1] + ), "The first two dimensions should be identical between x and x2" + assert ( + x_num_splitk > 1 + ), "x.shape[0] should be larger then 1 in x.dim() == 3 cases" + X_HAS_SPLITK = True + y2 = torch.empty((M, N2), dtype=dtype, device=x2.device) + else: + M, N1 = x.shape + # Activation (N/2) and storing results in uint8 (N/2) results in a feature dimension of N/4 + assert ( + N1 % 4 == 0 + ), "The last dimension for x1 should be multiple of 4 for acitvation, multiplication and mxfp4 quantization" + + MXFP4_QUANT_BLOCK_SIZE = 32 + N_half = N1 // 2 + y = torch.empty((M, N_half // 2), dtype=torch.uint8, device=x.device) + scaleN_valid = triton.cdiv(N_half, MXFP4_QUANT_BLOCK_SIZE) + # Setting scale M to be multiple of 256 and scale N to be multiple of 8 + use_scale_shuffle_padding = shuffle or scale_shuffle_padding + if use_scale_shuffle_padding: + scaleM = triton.cdiv(M, 256) * 256 + scaleN = triton.cdiv(scaleN_valid, 8) * 8 + else: + scaleM = M + scaleN = scaleN_valid + y_scale = torch.empty( + (scaleM, scaleN), + dtype=torch.uint8, + device=x.device, + ) + + NUM_ITER = 1 + NUM_WARPS = 4 + NUM_STAGES = 1 + + BLOCK_SIZE_M1 
= 1 if M <= 128 else 4 + BLOCK_SIZE_M2 = 1 if M <= 128 else 4 + + # for small N values + if N_half <= 1024: + BLOCK_SIZE_N1 = 32 + else: + BLOCK_SIZE_N1 = 128 + + if N2 <= 256: + BLOCK_SIZE_N2 = 8 + elif N2 <= 1024: + BLOCK_SIZE_N2 = 32 + else: + BLOCK_SIZE_N2 = 128 + + # shuffle requires block sizes to be multiple of 32 + if shuffle: + BLOCK_SIZE_M1 = triton.cdiv(BLOCK_SIZE_M1, 32) * 32 + BLOCK_SIZE_N1 = triton.cdiv(BLOCK_SIZE_N1, 32) * 32 + + num_pid = triton.cdiv(M, BLOCK_SIZE_M1) * triton.cdiv( + N_half, BLOCK_SIZE_N1 * NUM_ITER + ) + if X_HAS_SPLITK: + num_pid += triton.cdiv(M, BLOCK_SIZE_M2) * triton.cdiv(N2, BLOCK_SIZE_N2) + + grid = (num_pid,) + _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel[grid]( + x, + y, + y_scale, + x2, + y2, + 0 if not X_HAS_SPLITK else x.stride(0), + x.stride(0) if not X_HAS_SPLITK else x.stride(1), + x.stride(1) if not X_HAS_SPLITK else x.stride(2), + y.stride(0), + y.stride(1), + y_scale.stride(0), + y_scale.stride(1), + 0 if not X_HAS_SPLITK else x2.stride(0), + 0 if not X_HAS_SPLITK else x2.stride(1), + 0 if not X_HAS_SPLITK else x2.stride(2), + 0 if not X_HAS_SPLITK else y2.stride(0), + 0 if not X_HAS_SPLITK else y2.stride(1), + M=M, + N1=N_half, + N2=N2, + BLOCK_SIZE_M1=BLOCK_SIZE_M1, + BLOCK_SIZE_N1=BLOCK_SIZE_N1, + BLOCK_SIZE_M2=BLOCK_SIZE_M2, + BLOCK_SIZE_N2=BLOCK_SIZE_N2, + NUM_ITER=NUM_ITER, + NUM_STAGES=NUM_STAGES, + MXFP4_QUANT_BLOCK_SIZE=MXFP4_QUANT_BLOCK_SIZE, + SCALING_MODE=0, + ACTIVATION=_get_activation_from_str(activation) if activation else "", + scaleN=scaleN_valid, + scaleM_pad=(scaleM if use_scale_shuffle_padding else 1), + scaleN_pad=scaleN, + SHUFFLE=shuffle, + X_HAS_SPLITK=X_HAS_SPLITK, + X_NUM_KSPLIT=x_num_splitk, + X_NUM_KSPLIT_POW2=triton.next_power_of_2(x_num_splitk), + num_warps=NUM_WARPS, + waves_per_eu=0, + num_stages=1, + ) + + return (y, y_scale), y2 diff --git a/aiter/ops/triton/gemm_a16wfp4.py b/aiter/ops/triton/gemm_a16wfp4.py new file mode 100644 index 0000000000..40744fba68 --- /dev/null 
+++ b/aiter/ops/triton/gemm_a16wfp4.py @@ -0,0 +1,151 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +from typing import Optional +import torch +import triton +import triton.language as tl +import aiter.ops.triton.utils._triton.arch_info as arch_info +from aiter.ops.triton.quant import _mxfp4_quant_op +from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton._triton_kernels.gemm_a16wfp4 import ( + _gemm_a16wfp4_kernel, + _get_config, +) +from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( + _gemm_afp4wfp4_reduce_kernel, +) +from aiter.ops.triton.gemm_afp4wfp4 import ( + get_splitk, +) + + +_LOGGER = AiterTritonLogger() + + +def gemm_a16wfp4( + x, + w, + w_scales, + atomic_add: bool = False, + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, +): + """ + Computes the matmul Y = X x W + W is an e2m1 fp4 tensor and w_scales is an e8m0 tensor. + Every 32 elements in the K dimension share one e8m0 scale. + X gets quantized to the microscale fp4 (mxfp4) format before the GEMM. + + + Key parameters: + - X: Matrix X with shape (M, K). + - W: Matrix W with shape (N, K). + - W_scales: Matrix with shape (N, K // 32) + + Returns: + - Y: The output matrix with shape (M, N). 
+ """ + + _LOGGER.info( + f"GEMM_A16WFP4: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w_scales.shape)} " + ) + + assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" + + M, K = x.shape + N, K = w.shape + + # inner kernel expects (K, N) + w = w.T + + if config is None: + config = _get_config(M, N, K) + + if y is None: + if atomic_add: + y = torch.zeros((M, N), dtype=dtype, device=x.device) + else: + y = torch.empty((M, N), dtype=dtype, device=x.device) + + if config["NUM_KSPLIT"] > 1 and not atomic_add: + SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( + K, config["BLOCK_SIZE_K"], config["NUM_KSPLIT"] + ) + + config["SPLITK_BLOCK_SIZE"] = SPLITK_BLOCK_SIZE + config["BLOCK_SIZE_K"] = BLOCK_SIZE_K + config["NUM_KSPLIT"] = NUM_KSPLIT + + if config["BLOCK_SIZE_K"] >= 2 * K: + config["BLOCK_SIZE_K"] = triton.next_power_of_2(2 * K) + config["SPLITK_BLOCK_SIZE"] = 2 * K + config["NUM_KSPLIT"] = 1 + config["BLOCK_SIZE_K"] = max(config["BLOCK_SIZE_K"], 64) + + if config["NUM_KSPLIT"] > 1 and not atomic_add: + y_pp = torch.empty( + (config["NUM_KSPLIT"], M, N), dtype=torch.float32, device=y.device + ) + else: + config["SPLITK_BLOCK_SIZE"] = 2 * K + y_pp = None + + grid = lambda META: ( # noqa: E731 + ( + META["NUM_KSPLIT"] + * triton.cdiv(M, META["BLOCK_SIZE_M"]) + * triton.cdiv(N, META["BLOCK_SIZE_N"]) + ), + ) + _gemm_a16wfp4_kernel[grid]( + x, + w, + y if y_pp is None else y_pp, + w_scales, + M, + N, + K, + x.stride(0), + x.stride(1), + w.stride(0), + w.stride(1), + 0 if y_pp is None else y_pp.stride(0), + y.stride(0) if y_pp is None else y_pp.stride(1), + y.stride(1) if y_pp is None else y_pp.stride(2), + w_scales.stride(0), + w_scales.stride(1), + ATOMIC_ADD=atomic_add, + **config, + ) + + if config["NUM_KSPLIT"] > 1 and not atomic_add: + REDUCE_BLOCK_SIZE_M = 16 + REDUCE_BLOCK_SIZE_N = 64 + # TODO: Need to debug - REDUCE_BLOCK_SIZE_N=128 with fp32 partials fails + # NOTE: REDUCE_BLOCK_SIZE_N=16 gives best perf with fp32 partials 
and + # REDUCE_BLOCK_SIZE_N=128 gives best perf with bf16 partials + ACTUAL_KSPLIT = triton.cdiv(K, (config["SPLITK_BLOCK_SIZE"] // 2)) + + grid_reduce = ( + triton.cdiv(M, REDUCE_BLOCK_SIZE_M), + triton.cdiv(N, REDUCE_BLOCK_SIZE_N), + ) + _gemm_afp4wfp4_reduce_kernel[grid_reduce]( + y_pp, + y, + M, + N, + y_pp.stride(0), + y_pp.stride(1), + y_pp.stride(2), + y.stride(0), + y.stride(1), + REDUCE_BLOCK_SIZE_M, + REDUCE_BLOCK_SIZE_N, + ACTUAL_KSPLIT, + triton.next_power_of_2(config["NUM_KSPLIT"]), + ) + + return y diff --git a/aiter/ops/triton/gemm_afp4wfp4.py b/aiter/ops/triton/gemm_afp4wfp4.py index a5353b9051..820f2c5105 100644 --- a/aiter/ops/triton/gemm_afp4wfp4.py +++ b/aiter/ops/triton/gemm_afp4wfp4.py @@ -8,10 +8,10 @@ import aiter.ops.triton.utils._triton.arch_info as arch_info from aiter.ops.triton.utils.logger import AiterTritonLogger from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( - _gemm_afp4_wfp4_kernel, - _gemm_afp4_wfp4_kernel_preshuffled_scales, - _gemm_afp4_wfp4_kernel_preshuffled_weight_scales, - _gemm_afp4_wfp4_reduce_kernel, + _gemm_afp4wfp4_kernel, + _gemm_afp4wfp4_kernel_preshuffle_scales, + _gemm_afp4wfp4_preshuffle_kernel, + _gemm_afp4wfp4_reduce_kernel, _get_config, ) from .utils.core import AITER_TRITON_CONFIGS_PATH @@ -138,7 +138,7 @@ def gemm_afp4wfp4( ), ) - _gemm_afp4_wfp4_kernel[grid]( + _gemm_afp4wfp4_kernel[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -173,7 +173,7 @@ def gemm_afp4wfp4( triton.cdiv(M, REDUCE_BLOCK_SIZE_M), triton.cdiv(N, REDUCE_BLOCK_SIZE_N), ) - _gemm_afp4_wfp4_reduce_kernel[grid_reduce]( + _gemm_afp4wfp4_reduce_kernel[grid_reduce]( y_pp, y, M, @@ -272,7 +272,7 @@ def gemm_afp4wfp4_preshuffled_scales( ), ) - _gemm_afp4_wfp4_kernel_preshuffled_scales[grid]( + _gemm_afp4wfp4_kernel_preshuffled_scales[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -307,7 +307,7 @@ def gemm_afp4wfp4_preshuffled_scales( triton.cdiv(M, REDUCE_BLOCK_SIZE_M), triton.cdiv(N, REDUCE_BLOCK_SIZE_N), ) - 
_gemm_afp4_wfp4_reduce_kernel[grid_reduce]( + _gemm_afp4wfp4_reduce_kernel[grid_reduce]( y_pp, y, M, @@ -326,7 +326,7 @@ def gemm_afp4wfp4_preshuffled_scales( return y -def gemm_afp4wfp4_preshuffled_weight_scales( +def gemm_afp4wfp4_preshuffle( x, w, x_scales, @@ -417,13 +417,13 @@ def gemm_afp4wfp4_preshuffled_weight_scales( M_POW2 = triton.next_power_of_2(M) if M < 32 and M_POW2 > 16: M_POW2 = 16 - metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_gemm_afp4_wfp4_kernel_preshuffled_weight_scales.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" + metadata_pth = f"{AITER_TRITON_CONFIGS_PATH}/gemm/aot/{_gemm_afp4wfp4_preshuffle_kernel.fn.__name__}_M={M_POW2}-N={N}-K={K*2}" if use_aot and os.path.exists(metadata_pth): with AOTMetadataContext( - _gemm_afp4_wfp4_kernel_preshuffled_weight_scales.fn.__name__, + _gemm_afp4wfp4_preshuffle_kernel.fn.__name__, f"{metadata_pth}", ): - _gemm_afp4_wfp4_kernel_preshuffled_weight_scales[grid]( + _gemm_afp4wfp4_preshuffle_kernel[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -446,7 +446,7 @@ def gemm_afp4wfp4_preshuffled_weight_scales( **config, ) else: - _gemm_afp4_wfp4_kernel_preshuffled_weight_scales[grid]( + _gemm_afp4wfp4_preshuffle_kernel[grid]( x, w, y if config["NUM_KSPLIT"] == 1 else y_pp, @@ -481,7 +481,7 @@ def gemm_afp4wfp4_preshuffled_weight_scales( triton.cdiv(M, REDUCE_BLOCK_SIZE_M), triton.cdiv(N, REDUCE_BLOCK_SIZE_N), ) - _gemm_afp4_wfp4_reduce_kernel[grid_reduce]( + _gemm_afp4wfp4_reduce_kernel[grid_reduce]( y_pp, y, M, @@ -498,3 +498,19 @@ def gemm_afp4wfp4_preshuffled_weight_scales( ) return y + + +def gemm_afp4wfp4_preshuffled_weight_scales( + x, + w, + x_scales, + w_scales, + dtype: Optional[float] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, + use_aot: Optional[bool] = True, +): + _LOGGER.info( + "gemm_afp4wfp4_preshuffled_weight_scales will be deprecated in future AITER release, please switch to gemm_afp4wfp4_preshuffle" + ) + return 
gemm_afp4wfp4_preshuffle(x, w, x_scales, w_scales, dtype, y, config, use_aot) diff --git a/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py b/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py index 94369cc2c8..d3738fd4aa 100644 --- a/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py +++ b/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py @@ -5,12 +5,9 @@ import torch import triton import triton.language as tl -import aiter.ops.triton.utils._triton.arch_info as arch_info -from aiter.ops.triton.quant import _mxfp4_quant_op from aiter.ops.triton.utils.logger import AiterTritonLogger -from aiter.ops.triton._triton_kernels.gemm_afp4wfp4_pre_quant_atomic import ( - _gemm_afp4_wfp4_pre_quant_kernel, - _get_config, +from aiter.ops.triton.gemm_a16wfp4 import ( + gemm_a16wfp4, ) _LOGGER = AiterTritonLogger() @@ -24,69 +21,7 @@ def gemm_afp4wfp4_pre_quant( y: Optional[torch.Tensor] = None, config: Optional[dict] = None, ): - """ - Computes matrix multiplication Y = X @ W^T with on-the-fly FP4 quantization of activations. - X is quantized to MXFP4 during computation, W is pre-quantized FP4. Uses atomic operations for split-K reduction. - - Args: - x (torch.Tensor): Higher precision input matrix with shape (M, K) (BF16 or FP16). - Quantized to FP4 E2M1 on-the-fly during GEMM. - w (torch.Tensor): FP4 E2M1 weight matrix with shape (N, K), internally transposed. - w_scales (torch.Tensor): E8M0 per-group scale for w with shape (N, K//32). - One scale per 32 elements in K dimension. - dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). - y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). - Must be zero-initialized for atomic operations. - config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, - BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT). - - Returns: - torch.Tensor: Output with shape (M, N). 
- """ - _LOGGER.info( - f"GEMM_AFP4WFP4_PRE_QUANT_ATOMIC: x={tuple(x.shape)} w={tuple(w.shape)} w_scale={tuple(w_scales.shape)} " - ) - - assert arch_info.is_fp4_avail(), "MXFP4 is not available on your device" - - M, K = x.shape - N, K = w.shape - - # inner kernel expects (K, N) - w = w.T - - if y is None: - y = torch.zeros((M, N), dtype=dtype, device=x.device) - - if config is None: - config = _get_config(M, N, K) - - grid = lambda META: ( # noqa: E731 - ( - META["NUM_KSPLIT"] - * triton.cdiv(M, META["BLOCK_SIZE_M"]) - * triton.cdiv(N, META["BLOCK_SIZE_N"]) - ), - ) - _gemm_afp4_wfp4_pre_quant_kernel[grid]( - x, - w, - y, - w_scales, - M, - N, - K, - x.stride(0), - x.stride(1), - w.stride(0), - w.stride(1), - 0, - y.stride(0), - y.stride(1), - w_scales.stride(0), - w_scales.stride(1), - **config, + "gemm_afp4wfp4_pre_quant will be deprecated in future AITER release, please switch to gemm_a16wfp4" ) - - return y + return gemm_a16wfp4(x, w, w_scales, True, dtype, y, config) diff --git a/op_tests/op_benchmarks/triton/bench_batched_gemm_afp4wfp4_pre_quant.py b/op_tests/op_benchmarks/triton/bench_batched_gemm_a16wfp4.py similarity index 97% rename from op_tests/op_benchmarks/triton/bench_batched_gemm_afp4wfp4_pre_quant.py rename to op_tests/op_benchmarks/triton/bench_batched_gemm_a16wfp4.py index 16a0c8805f..fd92be7a0e 100644 --- a/op_tests/op_benchmarks/triton/bench_batched_gemm_afp4wfp4_pre_quant.py +++ b/op_tests/op_benchmarks/triton/bench_batched_gemm_a16wfp4.py @@ -2,8 +2,8 @@ import torch import triton import math -from op_tests.triton_tests.test_batched_gemm_afp4wfp4_pre_quant import ( - generate_batched_gemm_afp4wfp4_pre_quant_inputs, +from aiter.op_tests.triton_tests.test_batched_gemm_a16wfp4 import ( + generate_batched_gemm_a16wfp4_inputs, ) from op_tests.op_benchmarks.triton.utils.argparse import ( get_parser, diff --git a/op_tests/triton_tests/test_batched_gemm_afp4wfp4_pre_quant.py b/op_tests/triton_tests/test_batched_gemm_a16wfp4.py similarity index 92% 
rename from op_tests/triton_tests/test_batched_gemm_afp4wfp4_pre_quant.py rename to op_tests/triton_tests/test_batched_gemm_a16wfp4.py index 09094f513e..7fe8df7e7b 100755 --- a/op_tests/triton_tests/test_batched_gemm_afp4wfp4_pre_quant.py +++ b/op_tests/triton_tests/test_batched_gemm_a16wfp4.py @@ -1,7 +1,7 @@ import torch import pytest -from aiter.ops.triton.batched_gemm_afp4wfp4_pre_quant import ( - batched_gemm_afp4wfp4_pre_quant, +from aiter.ops.triton.batched_gemm_a16wfp4 import ( + batched_gemm_a16wfp4, ) import aiter.ops.triton.utils._triton.arch_info as arch_info @@ -9,9 +9,7 @@ SCALE_GROUP_SIZE = 32 -def generate_batched_gemm_afp4wfp4_pre_quant_inputs( - B, M, N, K, dtype, layout="TN", output=False -): +def generate_batched_gemm_a16wfp4_inputs(B, M, N, K, dtype, layout="TN", output=False): """ Returns: - x: (B, M, K) @@ -175,20 +173,18 @@ def run_torch(x, w, w_scales, dtype): @pytest.mark.parametrize("B, M, N, K", get_x_vals()) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("layout", ["TN", "TT", "NN", "NT"]) -def test_batched_gemm_afp4_wfp4_pre_quant( - B: int, M: int, N: int, K: int, layout, dtype -): +def test_batched_gemm_a16wfp4(B: int, M: int, N: int, K: int, layout, dtype): if not (arch_info.is_fp4_avail()): pytest.skip("MXFP4 not supported on this architecture") torch.cuda.empty_cache() # Helps avoid hangs in large tests - x, w, x_scales, w_scales, out = generate_batched_gemm_afp4wfp4_pre_quant_inputs( + x, w, x_scales, w_scales, out = generate_batched_gemm_a16wfp4_inputs( B, M, N, K, dtype, layout=layout, output=True ) torch_out = run_torch(x, w, w_scales, dtype).to(dtype) - batched_gemm_afp4wfp4_pre_quant(x, w, w_scales, dtype, out) + batched_gemm_a16wfp4(x, w, w_scales, dtype, out, transpose_bm=False, prequant=True) torch.testing.assert_close(torch_out, out) diff --git a/op_tests/triton_tests/test_fused_gemm_afp4wfp4_a16w16.py b/op_tests/triton_tests/test_fused_gemm_afp4wfp4_a16w16.py new file mode 
100644 index 0000000000..aecc1cede6 --- /dev/null +++ b/op_tests/triton_tests/test_fused_gemm_afp4wfp4_a16w16.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +import torch +import triton +import pytest +from aiter.ops.triton.fused_gemm_afp4wfp4_a16w16 import ( + fused_gemm_afp4wfp4_a16w16, +) +from op_tests.triton_tests.test_gemm_afp4wfp4 import ( + generate_gemm_afp4wfp4_inputs, +) +from op_tests.triton_tests.test_gemm_afp4wfp4 import run_torch as run_torch_fp4 +from op_tests.triton_tests.test_gemm_a16w16 import generate_gemm_a16w16_inputs +import torch.nn.functional as F +import aiter.ops.triton.utils._triton.arch_info as arch_info + + +def run_torch( + x_fp4, + w_fp4, + x_fp4_scale, + w_fp4_scale, + x_bf16, + w_bf16, + bias_fp4, + bias_bf16, + dtype=torch.bfloat16, +): + y_fp4 = run_torch_fp4(x_fp4, w_fp4, x_fp4_scale, w_fp4_scale, dtype) + if bias_fp4 is not None: + y_fp4 += bias_fp4 + y_bf16 = F.linear(x_bf16, w_bf16, bias=bias_bf16) + return y_fp4.to(dtype), y_bf16.to(dtype) + + +def run_triton( + x_fp4, + w_fp4, + x_fp4_scale, + w_fp4_scale, + x_bf16, + w_bf16, + bias_fp4, + bias_bf16, + dtype=torch.bfloat16, + y_fp4=None, + y_bf16=None, + skip_reduce=False, + is_fp4_preshuffled=True, +): + return fused_gemm_afp4wfp4_a16w16( + x_fp4, + w_fp4, + x_fp4_scale, + w_fp4_scale, + x_bf16, + w_bf16, + is_fp4_preshuffled=is_fp4_preshuffled, + bias_fp4=bias_fp4, + bias_bf16=bias_bf16, + dtype=dtype, + y_fp4=y_fp4, + y_bf16=y_bf16, + skip_reduce=skip_reduce, + ) + + +def get_x_vals(): + + x_vals = [ + (m, n1, n2, k) + for k in [1024, 8192, 7168] + for n2 in [256, 512] + for n1 in [256, 512] + for m in [1, 8, 32, 64, 128, 8192] + ] + return x_vals + + +@pytest.mark.parametrize("M, N1, N2, K", get_x_vals()) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("output", [True, False]) +@pytest.mark.parametrize("skip_reduce", [True, False]) 
+@pytest.mark.parametrize("fp4_shuffle", [True, False]) +def test_gemm(dtype, M, N1, N2, K, output, skip_reduce, fp4_shuffle): + + if not (arch_info.is_fp4_avail()): + pytest.skip("MXFP4 not supported on this architecture") + + ( + x_fp4, + w_fp4, + w_fp4_triton, + x_fp4_scale, + w_fp4_scale, + x_fp4_scale_triton, + w_fp4_scale_triton, + out_dtype, + y_fp4, + ) = generate_gemm_afp4wfp4_inputs( + M, + N1, + K, + dtype, + layout="TN", + output=output, + shuffle_scales_fg=fp4_shuffle, + shuffle_weight_fg=fp4_shuffle, + ) + + x_bf16, w_bf16, bias_bf16, _, y_bf16 = generate_gemm_a16w16_inputs( + M, N2, K, dtype, output=output, bias=True + ) + bias_bf16 = torch.randn((N2,), dtype=bias_bf16.dtype, device=bias_bf16.device) + bias_fp4 = torch.randn((N1,), dtype=bias_bf16.dtype, device=bias_bf16.device) + y_torch_fp4, y_torch_bf16 = run_torch( + x_fp4, + w_fp4, + x_fp4_scale, + w_fp4_scale, + x_bf16, + w_bf16, + bias_fp4, + bias_bf16, + dtype, + ) + y_triton_fp4, y_triton_bf16 = run_triton( + x_fp4, + w_fp4_triton, + x_fp4_scale_triton, + w_fp4_scale_triton, + x_bf16, + w_bf16, + bias_fp4, + bias_bf16, + dtype, + y_fp4, + y_bf16, + skip_reduce=skip_reduce, + is_fp4_preshuffled=fp4_shuffle, + ) + + if y_triton_fp4.dim() == 3: + y_triton_fp4 = y_triton_fp4.sum(axis=0).to(dtype=dtype) + y_triton_bf16 = y_triton_bf16.sum(axis=0).to(dtype=dtype) + + triton.testing.assert_close(y_torch_bf16, y_triton_bf16, atol=0.1, rtol=1e-1) + triton.testing.assert_close(y_torch_fp4, y_triton_fp4, atol=0.1, rtol=1e-1) diff --git a/op_tests/triton_tests/test_fused_gemm_afp4wfp4_mul_add.py b/op_tests/triton_tests/test_fused_gemm_afp4wfp4_mul_add.py new file mode 100644 index 0000000000..e75eeed64f --- /dev/null +++ b/op_tests/triton_tests/test_fused_gemm_afp4wfp4_mul_add.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+import pytest +import os +import torch +from aiter.ops.triton.fused_gemm_afp4wfp4_mul_add import ( + fused_gemm_afp4wfp4_mul_add, + fused_gemm_afp4wfp4_preshuffle_add_mul, +) +import aiter.ops.triton.utils._triton.arch_info as arch_info +from op_tests.triton_tests.test_gemm_afp4wfp4 import generate_gemm_afp4wfp4_inputs +from op_tests.triton_tests.test_gemm_afp4wfp4 import ( + run_torch as run_torch_gemm_afp4wfp4, +) +from op_tests.triton_tests.test_fused_mul_add import generate_fused_mul_add_inputs +from op_tests.triton_tests.test_fused_mul_add import ( + run_torch as run_torch_fused_mul_add, +) + + +def get_x_vals(): + + x_vals = [(1024 * v, 1024 * v, 1024 * v) for v in range(1, 9)] + x_vals += [(1, 1, 32)] # minimal case + return x_vals + + +@pytest.mark.parametrize("M, N, K", get_x_vals()) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("layout", ["TN"]) +@pytest.mark.parametrize("output", [True, False]) +@pytest.mark.parametrize( + "shuffle_weight_scales", + [True, False], +) +@pytest.mark.parametrize( + "a_type_is_scalar", + [(float, True), (int, True), (torch.Tensor, True), (torch.Tensor, False)], +) +@pytest.mark.parametrize( + "b_type_is_scalar", + # [(float, True), (int, True), (torch.Tensor, True), (torch.Tensor, False)], + [(torch.Tensor, False)], +) +@pytest.mark.parametrize( + "fuse_type", + [0, 1], +) +def test_fused_gemm_afp4wfp4_mul_add( + M: int, + N: int, + K: int, + dtype, + layout, + output, + shuffle_weight_scales, + a_type_is_scalar, + b_type_is_scalar, + fuse_type, +): + if not (arch_info.is_fp4_avail()): + pytest.skip("MXFP4 not supported on this architecture") + + if shuffle_weight_scales: + if N % 32 > 0: + pytest.skip( + f"N = {N} is not divisible by 32, skip this test for preshuffled weight/scales tests" + ) + elif K % 256 > 0: + pytest.skip( + f"K = {K} is not divisible by 256, skip this test for preshuffled weight/scales tests" + ) + + ( + x, + w, + w_triton, + x_scales, + w_scales, + 
x_scales_triton, + w_scales_triton, + out_dtype, + y, + ) = generate_gemm_afp4wfp4_inputs( + M, + N, + K, + dtype, + layout=layout, + output=output, + shuffle_scales_fg=shuffle_weight_scales, + shuffle_weight_fg=shuffle_weight_scales, + ) + _, a, b = generate_fused_mul_add_inputs( + [M, N], a_type_is_scalar, b_type_is_scalar, dtype + ) + + if fuse_type == 0: + torch_out = run_torch_fused_mul_add( + run_torch_gemm_afp4wfp4(x, w, x_scales, w_scales, torch.float32), a, b + ).to(dtype) + else: + torch_out = run_torch_fused_mul_add( + b, a, run_torch_gemm_afp4wfp4(x, w, x_scales, w_scales, torch.float32) + ).to(dtype) + + if shuffle_weight_scales: + if output: + triton_out = fused_gemm_afp4wfp4_preshuffle_add_mul( + x, + w_triton, + x_scales_triton, + w_scales_triton, + a, + b, + dtype, + y, + use_aot=(dtype == torch.bfloat16 and layout == "TN"), + fuse_type=fuse_type, + ) + else: + triton_out = fused_gemm_afp4wfp4_preshuffle_add_mul( + x, + w_triton, + x_scales_triton, + w_scales_triton, + a, + b, + dtype, + use_aot=(dtype == torch.bfloat16 and layout == "TN"), + fuse_type=fuse_type, + ) + else: + if output: + triton_out = fused_gemm_afp4wfp4_mul_add( + x, + w_triton, + x_scales_triton, + w_scales_triton, + a, + b, + dtype, + y, + fuse_type=fuse_type, + ) + else: + triton_out = fused_gemm_afp4wfp4_mul_add( + x, + w_triton, + x_scales_triton, + w_scales_triton, + a, + b, + dtype, + fuse_type=fuse_type, + ) + + torch.testing.assert_close(torch_out, triton_out, atol=0.1, rtol=0.1) diff --git a/op_tests/triton_tests/test_fused_mul_add.py b/op_tests/triton_tests/test_fused_mul_add.py index a8ffe86791..fdd8d783c6 100644 --- a/op_tests/triton_tests/test_fused_mul_add.py +++ b/op_tests/triton_tests/test_fused_mul_add.py @@ -3,7 +3,7 @@ from aiter.ops.triton.fused_mul_add import fused_mul_add -def generate_qk_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype): +def generate_fused_mul_add_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype): x = torch.randn(*shape, 
dtype=dtype, device="cuda") if a_type_is_scalar[1]: @@ -29,7 +29,7 @@ def generate_qk_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype): return x, a, b -def ref_mul_add(x, a, b): +def run_torch(x, a, b): return (a * x.to(torch.float32) + b).to(x.dtype) @@ -50,9 +50,11 @@ def test_mul_add(shape, a_type_is_scalar, b_type_is_scalar, output: bool, dtype) torch.cuda.empty_cache() # Helps avoid hangs in large tests - x, a, b = generate_qk_inputs(shape, a_type_is_scalar, b_type_is_scalar, dtype) + x, a, b = generate_fused_mul_add_inputs( + shape, a_type_is_scalar, b_type_is_scalar, dtype + ) - x_torch = ref_mul_add(x, a, b).clone() + x_torch = run_torch(x, a, b).clone() if output: x_triton = torch.empty_like(x) fused_mul_add(x, a, b, x_triton) diff --git a/op_tests/triton_tests/test_fused_mxfp4_quant.py b/op_tests/triton_tests/test_fused_mxfp4_quant.py index 8c66a8aa26..0ee8daa171 100644 --- a/op_tests/triton_tests/test_fused_mxfp4_quant.py +++ b/op_tests/triton_tests/test_fused_mxfp4_quant.py @@ -1,8 +1,10 @@ import torch +import torch.nn.functional as F import pytest from aiter.ops.triton.fused_mxfp4_quant import ( fused_flatten_mxfp4_quant, fused_rms_mxfp4_quant, + fused_reduce_act_mul_and_mxfp4_quant, ) from op_tests.triton_tests.test_quant_mxfp4 import torch_dynamic_mxfp4_quant from op_tests.triton_tests.test_gemm_afp4wfp4 import ( @@ -11,6 +13,7 @@ SCALE_GROUP_SIZE, ) from op_tests.triton_tests.test_gemm_afp4wfp4 import shuffle_scales, un_shuffle_scales +import aiter.ops.triton.utils._triton.arch_info as arch_info torch.manual_seed(0) @@ -33,6 +36,7 @@ def calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, eps=1e-6, shuffle=F x1 = res1_out = x1 + resid1 res1_out = res1_out.to(orig_dtype) x1 = rmsnorm(x1, rms1_w, eps) + out1 = x1.to(orig_dtype) out1_fp4, out1_scale = torch_dynamic_mxfp4_quant(x1) out2 = None @@ -55,7 +59,7 @@ def calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, eps=1e-6, shuffle=F out1_scale = shuffle_scales(out1_scale_pad) 
out1_scale = out1_scale.view(out1_scale.shape[0] * 32, -1) - return (out1_fp4, out1_scale), out2, res1_out + return (out1_fp4, out1_scale), out1, out2, res1_out def convert_mxfp4_to_fp32(x, x_scales): @@ -96,6 +100,10 @@ def generate_fused_rms_quant_data( @pytest.mark.parametrize("N", [32, 64, 128]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) def test_flatten_quant(B: int, M: int, N: int, dtype): + + if not (arch_info.is_fp4_avail()): + pytest.skip("MXFP4 not supported on this architecture") + torch.cuda.empty_cache() # Helps avoid hangs in large tests x = torch.randn((B, M, N), dtype=dtype, device="cuda").transpose(0, 1) @@ -135,6 +143,10 @@ def test_fused_rms_quant( shuffle: bool, scale_shuffle_padding: bool, ): + + if not (arch_info.is_fp4_avail()): + pytest.skip("MXFP4 not supported on this architecture") + torch.cuda.empty_cache() # Helps avoid hangs in large tests x1, x2, rms1_w, rms2_w, resid1 = generate_fused_rms_quant_data( x1_shape=(M, N1), @@ -145,11 +157,11 @@ def test_fused_rms_quant( res1=res1, dtype=dtype, ) - (x1_fp4_torch, x1_scales_torch), x2_torch, res1_out_torch = ( + (y1_fp4_torch, y1_scales_torch), y1_torch, y2_torch, y1_res_torch = ( calculate_target_w_torch(x1, rms1_w, resid1, x2, rms2_w, shuffle=shuffle) ) - (x1_fp4_triton, x1_scales_triton), x2_triton, res1_out_triton = ( + (y1_fp4_triton, y1_scales_triton), y1_triton, y2_triton, y1_res_triton = ( fused_rms_mxfp4_quant( x1, rms1_w, @@ -160,28 +172,152 @@ def test_fused_rms_quant( resid1, shuffle=shuffle, scale_shuffle_padding=scale_shuffle_padding, + output_unquantized_inp1=True, ) ) + if y1_triton is not None: + torch.testing.assert_close(y1_torch, y1_triton) + if shuffle: - x1_scales_triton = un_shuffle_scales( - x1_scales_triton.view(x1_scales_triton.shape[0] // 32, -1) + y1_scales_triton = un_shuffle_scales( + y1_scales_triton.view(y1_scales_triton.shape[0] // 32, -1) ) - x1_scales_torch = un_shuffle_scales( - 
x1_scales_torch.view(x1_scales_torch.shape[0] // 32, -1) + y1_scales_torch = un_shuffle_scales( + y1_scales_torch.view(y1_scales_torch.shape[0] // 32, -1) ) scaleN_valid = (N1 + 31) // 32 - x1_scales_triton = x1_scales_triton[:M, :scaleN_valid] - x1_scales_torch = x1_scales_torch[:M, :scaleN_valid] + y1_scales_triton = y1_scales_triton[:M, :scaleN_valid] + y1_scales_torch = y1_scales_torch[:M, :scaleN_valid] + + if y2_triton is not None: + torch.testing.assert_close(y2_torch, y2_triton) + + if y1_res_triton is not None: + torch.testing.assert_close(y1_res_torch, y1_res_triton) + + y1_fp32_torch = convert_mxfp4_to_fp32(y1_fp4_torch, y1_scales_torch) + y1_fp32_triton = convert_mxfp4_to_fp32(y1_fp4_triton, y1_scales_triton) + + torch.testing.assert_close(y1_fp32_torch, y1_fp32_triton) + + +def run_torch_reduce_act_mul_mxfp4_group_quant(x, x2, activation, dtype, shuffle): + x = x.to(torch.float32) + d = x.shape[-1] // 2 + y2 = None + if x.dim() == 3: + x = x.sum(axis=0) + y2 = x2.sum(axis=0).to(dtype=dtype) + else: + assert x2 is None, "x2 must be None in x.dim() == 2 cases" + x, x_mul = x.split([d, d], dim=-1) + if activation == "silu": + out = F.silu(x) * x_mul + elif activation == "gelu": + out = F.gelu(x) * x_mul + out, out_scale = torch_dynamic_mxfp4_quant(out) + if shuffle: + # out_scale_pad = out_scale + M = out_scale.shape[0] + N = out.shape[1] * 2 + scaleM = (M + 255) // 256 * 256 + scaleN_valid = (N + 31) // 32 + scaleN = (scaleN_valid + 7) // 8 * 8 + out_scale_pad = torch.empty( + (scaleM, scaleN), dtype=out_scale.dtype, device=out_scale.device + ) + out_scale_pad[:M, :scaleN] = out_scale[:M, :scaleN] + out_scale = shuffle_scales(out_scale_pad) + out_scale = out_scale.view(out_scale.shape[0] * 32, -1) + return (out, out_scale), y2 + + +def generate_fused_reduce_act_mul_mxfp4_group_quant( + M: int, + N1: int, + dtype=torch.bfloat16, + SPK: int = 1, + N2: int = 1, +): + if SPK == 1: + x = torch.randn((M, N1 * 2), dtype=dtype).cuda() / 10 + else: + x = 
torch.randn((SPK, M, N1 * 2), dtype=torch.float32).cuda() / 10 + x2 = None + if SPK > 1: + x2 = torch.randn((SPK, M, N2), dtype=torch.float32).cuda() / 10 - if x2_triton is not None: - torch.testing.assert_close(x2_torch, x2_triton) + return x, x2 + + +@pytest.mark.parametrize( + "M, N1, N2", + [ + (1, 256, 256), + (2, 256, 256), + (4, 256, 256), + (32, 256, 256), + (1, 4, 256), + (1, 28, 256), + (1, 32, 256), + (1, 64, 256), + (1, 68, 256), + (128, 28, 256), + (128, 32, 256), + (128, 64, 256), + (128, 68, 256), + (256, 32, 256), + ], +) +@pytest.mark.parametrize("SPK", [1, 4]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +@pytest.mark.parametrize("activation", ["silu", "gelu"]) +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("scale_shuffle_padding", [False, True]) +def test_fused_reduce_act_mul_mxfp4_group_quant( + M: int, + N1: int, + N2: int, + SPK: int, + dtype, + activation: str, + shuffle: bool, + scale_shuffle_padding: bool, +): + if not (arch_info.is_fp4_avail()): + pytest.skip("MXFP4 not supported on this architecture") + + if shuffle and (N1 * 2) % 512 != 0: + pytest.skip() + + x, x2 = generate_fused_reduce_act_mul_mxfp4_group_quant( + M, N1, dtype=dtype, SPK=SPK, N2=N2 + ) + + (y_q_torch, y_s_torch), y2_torch = run_torch_reduce_act_mul_mxfp4_group_quant( + x, x2, activation, dtype=dtype, shuffle=shuffle + ) + + (y_q_triton, y_s_triton), y2_triton = fused_reduce_act_mul_and_mxfp4_quant( + x, + activation=activation, + x2=x2, + shuffle=shuffle, + scale_shuffle_padding=scale_shuffle_padding, + dtype=dtype, + ) + + if shuffle: + y_s_triton = un_shuffle_scales(y_s_triton.view(y_s_triton.shape[0] // 32, -1)) + y_s_torch = un_shuffle_scales(y_s_torch.view(y_s_torch.shape[0] // 32, -1)) - if res1_out_triton is not None: - torch.testing.assert_close(res1_out_torch, res1_out_triton) + torch.testing.assert_close(y2_torch, y2_triton, atol=0.1, rtol=0.1) - res_fp32_torch = convert_mxfp4_to_fp32(x1_fp4_torch, 
x1_scales_torch) - res_fp32_triton = convert_mxfp4_to_fp32(x1_fp4_triton, x1_scales_triton) + scaleN_valid = (N1 // 2 + 31) // 32 + y_s_triton = y_s_triton[:M, :scaleN_valid] + y_s_torch = y_s_torch[:M, :scaleN_valid] - torch.testing.assert_close(res_fp32_torch, res_fp32_triton) + torch.testing.assert_close(y_q_triton, y_q_torch) + torch.testing.assert_close(y_s_triton, y_s_torch) diff --git a/op_tests/triton_tests/test_gemm_afp4wfp4_pre_quant_atomic.py b/op_tests/triton_tests/test_gemm_a16wfp4.py similarity index 83% rename from op_tests/triton_tests/test_gemm_afp4wfp4_pre_quant_atomic.py rename to op_tests/triton_tests/test_gemm_a16wfp4.py index 87b887d07c..50618c963b 100644 --- a/op_tests/triton_tests/test_gemm_afp4wfp4_pre_quant_atomic.py +++ b/op_tests/triton_tests/test_gemm_a16wfp4.py @@ -1,17 +1,19 @@ import torch import pytest -from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant +from aiter.ops.triton.gemm_a16wfp4 import gemm_a16wfp4 import aiter.ops.triton.utils._triton.arch_info as arch_info # Note this is specified by the HW and cannot be changed. 
SCALE_GROUP_SIZE = 32 -def generate_gemm_afp4wfp4_pre_quant_inputs( +def generate_gemm_a16wfp4_inputs( M: int, N: int, K: int, output: bool, + atomic_add: bool, + dtype: bool, layout: str = "TN", ): torch.manual_seed(5) @@ -49,7 +51,8 @@ def generate_gemm_afp4wfp4_pre_quant_inputs( y = None if output: - y = torch.zeros((M, N), device=x.device, dtype=torch.float32) + dtype = torch.float32 if atomic_add else dtype + y = torch.zeros((M, N), device=x.device, dtype=dtype) return x, w, x_scales, w_scales, y @@ -89,8 +92,9 @@ def get_x_vals(): x_vals += [(2 ** (v - 1), 4096 * v, 4096 * v) for v in range(1, 6)] x_vals += [(16, 16384, 3328 * 2), (128, 16384, 3328 * 2)] x_vals += [(32, 512, 7168)] - x_vals += [(1, 1, SCALE_GROUP_SIZE)] # minimal case x_vals += [(1, 1280, 8192)] + x_vals += [(v, 7168, 2048) for v in [1, 4, 8, 32, 64, 128]] + # x_vals += [(1, 1, SCALE_GROUP_SIZE)] # minimal case, TODO: fix return x_vals @@ -143,7 +147,10 @@ def run_torch(x, w, w_scales, dtype): @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) @pytest.mark.parametrize("layout", ["TN", "TT", "NN", "NT"]) @pytest.mark.parametrize("output", [True, False]) -def test_gemm_afp4_wfp4_pre_quant(M: int, N: int, K: int, dtype, layout, output: bool): +@pytest.mark.parametrize("atomic_add", [True, False]) +def test_gemm_a16wfp4( + M: int, N: int, K: int, dtype, layout, output: bool, atomic_add: bool +): if not (arch_info.is_fp4_avail()): pytest.skip("MXFP4 not supported on this architecture") @@ -153,13 +160,16 @@ def test_gemm_afp4_wfp4_pre_quant(M: int, N: int, K: int, dtype, layout, output: if M == 4864 and N == 8192 and K == 4160: pytest.skip("Skipping this config. 
due to compilation error.") - x, w, _, w_scales, y = generate_gemm_afp4wfp4_pre_quant_inputs( - M, N, K, layout=layout, output=output + x, w, _, w_scales, y = generate_gemm_a16wfp4_inputs( + M, N, K, output=output, atomic_add=atomic_add, dtype=dtype, layout=layout ) + y_dtype = torch.float32 if atomic_add else dtype if output: - y = gemm_afp4wfp4_pre_quant(x, w, w_scales, torch.float32, y).to(dtype) + y = gemm_a16wfp4(x, w, w_scales, atomic_add=atomic_add, dtype=y_dtype, y=y).to( + dtype + ) else: - y = gemm_afp4wfp4_pre_quant(x, w, w_scales, torch.float32).to(dtype) + y = gemm_a16wfp4(x, w, w_scales, atomic_add=atomic_add, dtype=y_dtype).to(dtype) torch_out = run_torch(x, w, w_scales, dtype).to(dtype) diff --git a/op_tests/triton_tests/test_gemm_afp4wfp4.py b/op_tests/triton_tests/test_gemm_afp4wfp4.py index 7f79d2c536..98325c8e89 100644 --- a/op_tests/triton_tests/test_gemm_afp4wfp4.py +++ b/op_tests/triton_tests/test_gemm_afp4wfp4.py @@ -5,8 +5,7 @@ import torch from aiter.ops.triton.gemm_afp4wfp4 import ( gemm_afp4wfp4, - gemm_afp4wfp4_preshuffled_scales, - gemm_afp4wfp4_preshuffled_weight_scales, + gemm_afp4wfp4_preshuffle, ) import aiter.ops.triton.utils._triton.arch_info as arch_info from aiter.ops.triton.utils.types import str_to_torch_dtype @@ -230,22 +229,16 @@ def run_torch(x, w, x_scales, w_scales, dtype): @pytest.mark.parametrize("layout", ["TN", "TT", "NN", "NT"]) @pytest.mark.parametrize("output", [True, False]) @pytest.mark.parametrize( - "shuffle_scales_fg, shuffle_weight_fg", - [(False, False), (True, False), (True, True)], + "shuffle_weight_scales", + [True, False], ) def test_gemm_afp4_wfp4( - M: int, N: int, K: int, dtype, layout, output, shuffle_scales_fg, shuffle_weight_fg + M: int, N: int, K: int, dtype, layout, output, shuffle_weight_scales ): if not (arch_info.is_fp4_avail()): pytest.skip("MXFP4 not supported on this architecture") - if shuffle_weight_fg and not shuffle_scales_fg: - pytest.skip("Preshuffling weight without preshuffled 
scales is not supported") - - if shuffle_weight_fg or shuffle_scales_fg: - if shuffle_scales_fg and not shuffle_weight_fg and M < 32: - pytest.skip("Minimal tile size for preshuffled scales is 32x32x256") - + if shuffle_weight_scales: if N % 32 > 0: pytest.skip( f"N = {N} is not divisible by 32, skip this test for preshuffled weight/scales tests" @@ -272,15 +265,15 @@ def test_gemm_afp4_wfp4( dtype, layout=layout, output=output, - shuffle_scales_fg=shuffle_scales_fg, - shuffle_weight_fg=shuffle_weight_fg, + shuffle_scales_fg=shuffle_weight_scales, + shuffle_weight_fg=shuffle_weight_scales, ) torch_out = run_torch(x, w, x_scales, w_scales, dtype).to(dtype) - if shuffle_scales_fg and shuffle_weight_fg: + if shuffle_weight_scales: if output: - triton_out = gemm_afp4wfp4_preshuffled_weight_scales( + triton_out = gemm_afp4wfp4_preshuffle( x, w_triton, x_scales_triton, @@ -290,7 +283,7 @@ def test_gemm_afp4_wfp4( use_aot=(dtype == torch.bfloat16 and layout == "TN"), ) else: - triton_out = gemm_afp4wfp4_preshuffled_weight_scales( + triton_out = gemm_afp4wfp4_preshuffle( x, w_triton, x_scales_triton, @@ -298,15 +291,15 @@ def test_gemm_afp4_wfp4( dtype, use_aot=(dtype == torch.bfloat16 and layout == "TN"), ) - elif shuffle_scales_fg and not shuffle_weight_fg: - if output: - triton_out = gemm_afp4wfp4_preshuffled_scales( - x, w_triton, x_scales_triton, w_scales_triton, dtype, y - ) - else: - triton_out = gemm_afp4wfp4_preshuffled_scales( - x, w_triton, x_scales_triton, w_scales_triton, dtype - ) + # TODO: remove in the future + # if output: + # triton_out = gemm_afp4wfp4_preshuffled_scales( + # x, w_triton, x_scales_triton, w_scales_triton, dtype, y + # ) + # else: + # triton_out = gemm_afp4wfp4_preshuffled_scales( + # x, w_triton, x_scales_triton, w_scales_triton, dtype + # ) else: if output: triton_out = gemm_afp4wfp4(